diff --git a/CMakeLists.txt b/CMakeLists.txt index 7dd7ade94..f06bbae0a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,7 +62,6 @@ endif() option(BUILD_EXAMPLES "Build examples" TRUE) option(BUILD_FT "Build functional tests" TRUE) -option(BUILD_REG_TESTS "Build regression tests" TRUE) option(BUILD_CONFIG "Build cmake configs" TRUE) option(ENABLE_MPI "Enable MPI support" TRUE) option(ENABLE_MPI_TESTS "Enable MPI tests support" TRUE) @@ -116,11 +115,12 @@ set(CCL_INSTALL_DOC "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_DOCDIR}") set(CCL_INSTALL_BIN "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}") set(CCL_INSTALL_ENV "${CMAKE_INSTALL_PREFIX}/env") set(CCL_INSTALL_ETC "${CMAKE_INSTALL_PREFIX}/etc") -set(CCL_INSTALL_LICENSE "${CMAKE_INSTALL_PREFIX}/licensing") +set(CCL_INSTALL_SHARE "${CMAKE_INSTALL_PREFIX}/share") +set(CCL_INSTALL_LICENSE "${CCL_INSTALL_SHARE}/doc/ccl/licensing") set(CCL_INSTALL_MODULE "${CMAKE_INSTALL_PREFIX}/modulefiles") set(CCL_INSTALL_EXAMPLES "${CMAKE_INSTALL_PREFIX}/examples") set(CCL_INSTALL_TESTS "${CMAKE_INSTALL_PREFIX}/tests") -set(CCL_INSTALL_KERNELS "${CMAKE_INSTALL_PREFIX}/lib/kernels") +set(CCL_INSTALL_KERNELS "${CMAKE_INSTALL_PREFIX}/lib/ccl/kernels") # setup dependency directories set(DEPS_DIR "${PROJECT_SOURCE_DIR}/deps") @@ -302,7 +302,7 @@ file(GLOB spv_kernels "${PROJECT_SOURCE_DIR}/src/kernels/kernels.spv") endif() set(CCL_MAJOR_VERSION "2021") -set(CCL_MINOR_VERSION "10") +set(CCL_MINOR_VERSION "11") set(CCL_UPDATE_VERSION "0") set(CCL_PRODUCT_STATUS "Gold") string(TIMESTAMP CCL_PRODUCT_BUILD_DATE "%Y-%m-%dT %H:%M:%SZ") @@ -340,6 +340,8 @@ if (ENABLE_MPI_TESTS) endif() if (CCL_ENABLE_SYCL) add_subdirectory(examples/sycl) + #TODO: add cpu support + add_subdirectory(examples/pt2pt) endif() endif() if (BUILD_FT) diff --git a/cmake/FindIntelSYCL_level_zero.cmake b/cmake/FindIntelSYCL_level_zero.cmake index 08784bd89..7eb7d5a17 100644 --- a/cmake/FindIntelSYCL_level_zero.cmake +++ b/cmake/FindIntelSYCL_level_zero.cmake @@ -44,6 +44,7 @@ find_path(INTEL_SYCL_INCLUDE_DIRS PATHS ${sycl_root_hints} "${INTEL_SYCL_BINARY_DIR}/.." 
+ "${INTEL_SYCL_BINARY_DIR}/../opt/compiler" PATH_SUFFIXES include include/sycl diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index ca35065f2..e0cf00bba 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -12,6 +12,7 @@ function(set_lp_env) set(CLANG_BF16_AVX512BF_MIN_SUPPORTED "9.3.0") if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" + OR (${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM") OR (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${CLANG_BF16_MIN_SUPPORTED}) OR (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" @@ -30,7 +31,8 @@ function(set_lp_env) string(REGEX MATCH "([0-9]+)\\.([0-9]+)" BINUTILS_VERSION ${BINUTILS_VERSION_RAW}) message(STATUS "binutils version: " "${BINUTILS_VERSION}") - if ((${CMAKE_C_COMPILER_ID} STREQUAL "Intel" + if (((${CMAKE_C_COMPILER_ID} STREQUAL "Intel" + OR ${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM") AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${ICC_BF16_AVX512BF_MIN_SUPPORTED}) OR (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${CLANG_BF16_AVX512BF_MIN_SUPPORTED}) @@ -46,7 +48,7 @@ function(set_lp_env) message(STATUS "BF16 AVX512BF compiler: ${CCL_BF16_AVX512BF_COMPILER}") if (CCL_BF16_COMPILER) - if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) + if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) add_definitions(-DCCL_BF16_TARGET_ATTRIBUTES) set(CCL_BF16_TARGET_ATTRIBUTES ON) else() @@ -66,6 +68,7 @@ function(set_lp_env) set(CLANG_FP16_MIN_SUPPORTED "9.0.0") if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" + OR (${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM") OR (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${CLANG_FP16_MIN_SUPPORTED}) OR (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" @@ -79,7 +82,8 @@ function(set_lp_env) message(STATUS "FP16 compiler: ${CCL_FP16_COMPILER}") if (CCL_FP16_COMPILER) - if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) + if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM" + OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) add_definitions(-DCCL_FP16_TARGET_ATTRIBUTES) set(CCL_FP16_TARGET_ATTRIBUTES ON) else() @@ -104,6 +108,7 @@ function(set_avx_env) set(CLANG_AVX_MIN_SUPPORTED "9.0.0") if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" + OR ${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM" OR (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${CLANG_AVX_MIN_SUPPORTED}) OR (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" @@ -117,7 +122,7 @@ function(set_avx_env) message(STATUS "AVX compiler: ${CCL_AVX_COMPILER}") if (CCL_AVX_COMPILER) - if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) + if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) add_definitions(-DCCL_AVX_TARGET_ATTRIBUTES) set(CCL_AVX_TARGET_ATTRIBUTES ON) else() @@ -148,6 +153,10 @@ function(check_compiler_version) if(${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${CLANG_MIN_SUPPORTED}) message(FATAL_ERROR "clang min supported version is ${CLANG_MIN_SUPPORTED}, current version ${CMAKE_C_COMPILER_VERSION}") endif() + elseif(${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM") + if(${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${ICC_MIN_SUPPORTED}) + message(FATAL_ERROR "icc min supported version is ${ICC_MIN_SUPPORTED}, current version 
${CMAKE_C_COMPILER_VERSION}")
+ endif()
 else() message(WARNING "Compilation with ${CMAKE_C_COMPILER_ID} was not tested, no warranty") endif()
diff --git a/cmake/setvars.sh.in b/cmake/setvars.sh.in
index 7c9225b35..4892b9952 100644
--- a/cmake/setvars.sh.in
+++ b/cmake/setvars.sh.in
@@ -104,5 +104,5 @@ then else PATH="${CCL_ROOT}/bin:${PATH}"; export PATH fi
-
-FI_PROVIDER_PATH="${CCL_ROOT}/@CMAKE_INSTALL_LIBDIR@/prov:/usr/lib64/libfabric"; export FI_PROVIDER_PATH
+LD_LIBRARY_PATH=$(prepend_path "${I_MPI_ROOT}/opt/mpi/libfabric/lib" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH
+FI_PROVIDER_PATH="${I_MPI_ROOT}/opt/mpi/libfabric/lib/prov:/usr/lib64/libfabric"; export FI_PROVIDER_PATH
diff --git a/cmake/templates/oneCCLConfig.cmake.in b/cmake/templates/oneCCLConfig.cmake.in
index 0decd0c19..c13caf03d 100644
--- a/cmake/templates/oneCCLConfig.cmake.in
+++ b/cmake/templates/oneCCLConfig.cmake.in
@@ -17,22 +17,19 @@ get_filename_component(_oneccl_root "${CMAKE_CURRENT_LIST_DIR}" REALPATH) get_filename_component(_oneccl_root "${_oneccl_root}/../../../" ABSOLUTE)
-set(_oneccl_subdir "cpu_gpu_dpcpp")
-
-if (EXISTS "${CCL_CONFIGURATION}")
- set(_oneccl_subdir "${CCL_CONFIGURATION}")
-endif()
-
-if (_oneccl_subdir EQUAL "cpu")
+if ("$ENV{CCL_CONFIGURATION}" STREQUAL "cpu")
 include(CheckCXXCompilerFlag) check_cxx_compiler_flag("-fsycl" _fsycl_option) if (_fsycl_option) message(STATUS "STATUS: -fsycl not supported for CCL_CONFIGURATION=cpu") endif()
-endif()
-get_filename_component(_oneccl_headers "${_oneccl_root}/include/${_oneccl_subdir}" ABSOLUTE)
-get_filename_component(_oneccl_lib "${_oneccl_root}/lib/${_oneccl_subdir}/libccl.so" ABSOLUTE)
+ get_filename_component(_oneccl_headers "${_oneccl_root}/include" ABSOLUTE)
+ get_filename_component(_oneccl_lib "${_oneccl_root}/lib/ccl/cpu/lib/libccl.so" ABSOLUTE)
+else()
+ get_filename_component(_oneccl_headers "${_oneccl_root}/include" ABSOLUTE)
+ get_filename_component(_oneccl_lib "${_oneccl_root}/lib/libccl.so" ABSOLUTE)
+endif()
 if (EXISTS "${_oneccl_headers}" AND EXISTS "${_oneccl_lib}") if (NOT TARGET oneCCL)
@@ -60,4 +57,3 @@ else() endif() set(oneCCL_FOUND FALSE) endif()
-
diff --git a/cmake/vars.sh.in b/cmake/vars.sh.in
index 13ffda8d0..bb761c68b 100644
--- a/cmake/vars.sh.in
+++ b/cmake/vars.sh.in
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# shellcheck shell=sh
 get_script_path() ( script="$1"
diff --git a/deps/hwloc/include/hwloc.h b/deps/hwloc/include/hwloc.h
index 18ea1dfa1..f58adec5e 100644
--- a/deps/hwloc/include/hwloc.h
+++ b/deps/hwloc/include/hwloc.h
@@ -77,6 +77,25 @@ extern "C" { #endif
+/** \defgroup hwlocality_api_error_reporting Error reporting in the API
+ * @{
+ * Most functions in the hwloc API return an integer value.
+ * Unless documented otherwise, they return 0 on success
+ * and -1 on error.
+ * Functions that return a pointer type return \c NULL on error.
+ *
+ * \p errno will be set to a meaningful value whenever possible.
+ * This includes the usual \c EINVAL when invalid function parameters are passed
+ * or \c ENOMEM when an internal allocation fails.
+ * Some specific \c errno values are also used, for instance for binding
+ * errors as documented in \ref hwlocality_cpubinding.
+ *
+ * Some modules describe return values of their functions
+ * in their introduction, for instance in \ref hwlocality_bitmap.
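As an illustrative aside to the error-reporting convention documented above, a minimal caller-side sketch (hypothetical application code, assuming only hwloc calls documented in this header): integer-returning entry points yield 0 on success and -1 on error with errno set.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <hwloc.h>

    int main(void)
    {
        hwloc_topology_t topology;
        /* 0 on success, -1 on error with errno set (e.g. ENOMEM) */
        if (hwloc_topology_init(&topology) < 0) {
            fprintf(stderr, "hwloc_topology_init: %s\n", strerror(errno));
            return 1;
        }
        if (hwloc_topology_load(topology) < 0) {
            fprintf(stderr, "hwloc_topology_load: %s\n", strerror(errno));
            hwloc_topology_destroy(topology);
            return 1;
        }
        hwloc_topology_destroy(topology);
        return 0;
    }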
+ * @} + */ + + /** \defgroup hwlocality_api_version API version * @{ */ @@ -98,6 +117,8 @@ extern "C" { /** \brief Indicate at runtime which hwloc API version was used at build time. * * Should be ::HWLOC_API_VERSION if running on the same version. + * + * \return the build-time version number. */ HWLOC_DECLSPEC unsigned hwloc_get_api_version(void); @@ -358,14 +379,17 @@ typedef enum hwloc_obj_osdev_type_e { /** \brief Compare the depth of two object types * * Types shouldn't be compared as they are, since newer ones may be added in - * the future. This function returns less than, equal to, or greater than zero - * respectively if \p type1 objects usually include \p type2 objects, are the - * same as \p type2 objects, or are included in \p type2 objects. If the types - * can not be compared (because neither is usually contained in the other), - * ::HWLOC_TYPE_UNORDERED is returned. Object types containing CPUs can always - * be compared (usually, a system contains machines which contain nodes which - * contain packages which contain caches, which contain cores, which contain - * processors). + * the future. + * + * \return A negative integer if \p type1 objects usually include \p type2 objects. + * \return A positive integer if \p type1 objects are usually included in \p type2 objects. + * \return 0 if \p type1 and \p type2 objects are the same. + * \return ::HWLOC_TYPE_UNORDERED if objects cannot be compared + * (because neither is usually contained in the other). + * + * \note Object types containing CPUs can always be compared + * (usually, a machine contains packages, which contain caches, + * which contain cores, which contain PUs). * * \note ::HWLOC_OBJ_PU will always be the deepest, * while ::HWLOC_OBJ_MACHINE is always the highest. @@ -575,7 +599,7 @@ struct hwloc_obj { * \note Its value must not be changed, hwloc_bitmap_dup() must be used instead. */ - struct hwloc_info_s *infos; /**< \brief Array of stringified info type=name. */ + struct hwloc_info_s *infos; /**< \brief Array of info attributes (name and value strings). */ unsigned infos_count; /**< \brief Size of infos array. */ /* misc */ @@ -667,7 +691,7 @@ union hwloc_obj_attr_u { } osdev; }; -/** \brief Object info +/** \brief Object info attribute (name and value strings) * * \sa hwlocality_info_attr */ @@ -734,6 +758,8 @@ HWLOC_DECLSPEC void hwloc_topology_destroy (hwloc_topology_t topology); * * This is useful for keeping a backup while modifying a topology. * + * \return 0 on success, -1 on error. + * * \note Object userdata is not duplicated since hwloc does not know what it point to. * The objects of both old and new topologies will point to the same userdata. */ @@ -788,6 +814,8 @@ HWLOC_DECLSPEC void hwloc_topology_check(hwloc_topology_t topology); * * This is the depth of ::HWLOC_OBJ_PU objects plus one. * + * \return the depth of the object tree. + * * \note NUMA nodes, I/O and Misc objects are ignored when computing * the depth of the tree (they are placed on special levels). */ @@ -795,23 +823,26 @@ HWLOC_DECLSPEC int hwloc_topology_get_depth(hwloc_topology_t __hwloc_restrict to /** \brief Returns the depth of objects of type \p type. * - * If no object of this type is present on the underlying architecture, or if - * the OS doesn't provide this kind of information, the function returns - * ::HWLOC_TYPE_DEPTH_UNKNOWN. + * \return The depth of objects of type \p type. * - * If type is absent but a similar type is acceptable, see also - * hwloc_get_type_or_below_depth() and hwloc_get_type_or_above_depth(). 
- *
- * If ::HWLOC_OBJ_GROUP is given, the function may return ::HWLOC_TYPE_DEPTH_MULTIPLE
- * if multiple levels of Groups exist.
- *
- * If a NUMA node, I/O or Misc object type is given, the function returns a virtual
- * value because these objects are stored in special levels that are not CPU-related.
+ * \return A negative virtual depth if a NUMA node, I/O or Misc object type is given.
+ * These objects are stored in special levels that are not CPU-related.
 * This virtual depth may be passed to other hwloc functions such as
 * hwloc_get_obj_by_depth() but it should not be considered as an actual
 * depth by the application. In particular, it should not be compared with
 * any other object depth or with the entire topology depth.
- * \sa hwloc_get_memory_parents_depth().
+ *
+ * \return ::HWLOC_TYPE_DEPTH_UNKNOWN
+ * if no object of this type is present on the underlying architecture,
+ * or if the OS doesn't provide this kind of information.
+ *
+ * \return ::HWLOC_TYPE_DEPTH_MULTIPLE if type ::HWLOC_OBJ_GROUP is given
+ * and multiple levels of Groups exist.
+ *
+ * \note If the type is absent but a similar type is acceptable, see also
+ * hwloc_get_type_or_below_depth() and hwloc_get_type_or_above_depth().
+ *
+ * \sa hwloc_get_memory_parents_depth() for managing the depth of memory objects.
 *
 * \sa hwloc_type_sscanf_as_depth() for returning the depth of objects
 * whose type is given as a string.
@@ -887,18 +918,23 @@ hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
 * \p depth should between 0 and hwloc_topology_get_depth()-1,
 * or a virtual depth such as ::HWLOC_TYPE_DEPTH_NUMANODE.
 *
+ * \return The type of objects at depth \p depth.
 * \return (hwloc_obj_type_t)-1 if depth \p depth does not exist.
 */ HWLOC_DECLSPEC hwloc_obj_type_t hwloc_get_depth_type (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
 /** \brief Returns the width of level at depth \p depth.
+ *
+ * \return The number of objects at topology depth \p depth.
+ * \return 0 if there are no objects at depth \p depth.
 */ HWLOC_DECLSPEC unsigned hwloc_get_nbobjs_by_depth (hwloc_topology_t topology, int depth) __hwloc_attribute_pure;
 /** \brief Returns the width of level type \p type
 *
- * If no object for that type exists, 0 is returned.
- * If there are several levels with objects of that type, -1 is returned.
+ * \return The number of objects of type \p type.
+ * \return -1 if there are multiple levels with objects of that type, e.g. ::HWLOC_OBJ_GROUP.
+ * \return 0 if there are no objects of type \p type.
 */ static __hwloc_inline int hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
@@ -906,34 +942,45 @@ hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) __hw
 /** \brief Returns the top-object of the topology-tree.
 *
 * Its type is ::HWLOC_OBJ_MACHINE.
+ *
+ * This function cannot return \c NULL.
 */ static __hwloc_inline hwloc_obj_t hwloc_get_root_obj (hwloc_topology_t topology) __hwloc_attribute_pure;
-/** \brief Returns the topology object at logical index \p idx from depth \p depth */
+/** \brief Returns the topology object at logical index \p idx from depth \p depth
+ *
+ * \return The object if it exists.
+ * \return \c NULL if there is no object with this index and depth.
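For context on how the depth and width queries above compose, a short sketch that walks every level of a loaded topology (a hypothetical fragment; `topology` is assumed initialized and loaded as in the earlier sketch):

    /* print one line per level: depth, width, and object type */
    int depth, topodepth = hwloc_topology_get_depth(topology);
    for (depth = 0; depth < topodepth; depth++)
        printf("depth %d: %u x %s\n", depth,
               hwloc_get_nbobjs_by_depth(topology, depth),
               hwloc_obj_type_string(hwloc_get_depth_type(topology, depth)));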
+ */ HWLOC_DECLSPEC hwloc_obj_t hwloc_get_obj_by_depth (hwloc_topology_t topology, int depth, unsigned idx) __hwloc_attribute_pure;
 /** \brief Returns the topology object at logical index \p idx with type \p type
 *
- * If no object for that type exists, \c NULL is returned.
- * If there are several levels with objects of that type (::HWLOC_OBJ_GROUP),
- * \c NULL is returned and the caller may fallback to hwloc_get_obj_by_depth().
+ * \return The object if it exists.
+ * \return \c NULL if there is no object with this index and type.
+ * \return \c NULL if there are multiple levels with objects of that type (e.g. ::HWLOC_OBJ_GROUP),
+ * the caller may fall back to hwloc_get_obj_by_depth().
 */ static __hwloc_inline hwloc_obj_t hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
 /** \brief Returns the next object at depth \p depth.
 *
- * If \p prev is \c NULL, return the first object at depth \p depth.
+ * \return The first object at depth \p depth if \p prev is \c NULL.
+ * \return The object after \p prev at depth \p depth if \p prev is not \c NULL.
+ * \return \c NULL if there is no such object.
 */ static __hwloc_inline hwloc_obj_t hwloc_get_next_obj_by_depth (hwloc_topology_t topology, int depth, hwloc_obj_t prev);
 /** \brief Returns the next object of type \p type.
 *
- * If \p prev is \c NULL, return the first object at type \p type. If
- * there are multiple or no depth for given type, return \c NULL and
- * let the caller fallback to hwloc_get_next_obj_by_depth().
+ * \return The first object of type \p type if \p prev is \c NULL.
+ * \return The object after \p prev of type \p type if \p prev is not \c NULL.
+ * \return \c NULL if there is no such object.
+ * \return \c NULL if there are multiple levels with objects of that type (e.g. ::HWLOC_OBJ_GROUP),
+ * the caller may fall back to hwloc_get_next_obj_by_depth().
 */ static __hwloc_inline hwloc_obj_t hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
@@ -954,6 +1001,8 @@ hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, *
 * hwloc_obj_type_snprintf() may return a more precise output for a specific
 * object, but it requires the caller to provide the output buffer.
+ *
+ * \return A constant string containing the object type name or \c "Unknown".
 */ HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwloc_attribute_const;
@@ -1049,23 +1098,26 @@ HWLOC_DECLSPEC int hwloc_type_sscanf_as_depth(const char *string,
-/** \defgroup hwlocality_info_attr Consulting and Adding Key-Value Info Attributes
+/** \defgroup hwlocality_info_attr Consulting and Adding Info Attributes
 *
 * @{ */
-/** \brief Search the given key name in object infos and return the corresponding value.
+/** \brief Search the given name in object infos and return the corresponding value.
+ *
+ * If multiple info attributes match the given name, only the first one is returned.
 *
- * If multiple keys match the given name, only the first one is returned.
+ * \return A pointer to the value string if it exists.
+ * \return \c NULL if no such info attribute exists.
 *
- * \return \c NULL if no such key exists.
+ * \note The string should not be freed by the caller, it belongs to the hwloc library.
 */ static __hwloc_inline const char * hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_pure;
-/** \brief Add the given info name and value pair to the given object.
+/** \brief Add the given name and value pair to the given object info attributes. * - * The info is appended to the existing info array even if another key + * The info pair is appended to the existing info array even if another pair * with the same name already exists. * * The input strings are copied before being added in the object infos. @@ -1073,10 +1125,10 @@ hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_ * \return \c 0 on success, \c -1 on error. * * \note This function may be used to enforce object colors in the lstopo - * graphical output by using "lstopoStyle" as a name and "Background=#rrggbb" + * graphical output by adding "lstopoStyle" as a name and "Background=#rrggbb" * as a value. See CUSTOM COLORS in the lstopo(1) manpage for details. * - * \note If \p value contains some non-printable characters, they will + * \note If \p name or \p value contain some non-printable characters, they will * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h. */ HWLOC_DECLSPEC int hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value); @@ -1193,7 +1245,7 @@ typedef enum { * a problem for the application, but if it is, setting this flag * will make hwloc avoid using OS functions that would also bind * memory. This will however reduce the support of CPU bindings, - * i.e. potentially return -1 with errno set to ENOSYS in some + * i.e. potentially return -1 with errno set to \c ENOSYS in some * cases. * * This flag is only meaningful when used with functions that set @@ -1206,8 +1258,9 @@ typedef enum { /** \brief Bind current process or thread on CPUs given in physical bitmap \p set. * - * \return -1 with errno set to ENOSYS if the action is not supported - * \return -1 with errno set to EXDEV if the binding cannot be enforced + * \return 0 on success. + * \return -1 with errno set to \c ENOSYS if the action is not supported. + * \return -1 with errno set to \c EXDEV if the binding cannot be enforced. */ HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags); @@ -1216,10 +1269,14 @@ HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpus * The CPU-set \p set (previously allocated by the caller) * is filled with the list of PUs which the process or * thread (according to \e flags) was last bound to. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags); /** \brief Bind a process \p pid on CPUs given in physical bitmap \p set. + * + * \return 0 on success, -1 on error. * * \note \p hwloc_pid_t is \p pid_t on Unix platforms, * and \p HANDLE on native Windows platforms. @@ -1238,6 +1295,8 @@ HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t * is filled with the list of PUs which the process * was last bound to. * + * \return 0 on success, -1 on error. + * * \note \p hwloc_pid_t is \p pid_t on Unix platforms, * and \p HANDLE on native Windows platforms. * @@ -1251,6 +1310,8 @@ HWLOC_DECLSPEC int hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t #ifdef hwloc_thread_t /** \brief Bind a thread \p thread on CPUs given in physical bitmap \p set. + * + * \return 0 on success, -1 on error. * * \note \p hwloc_thread_t is \p pthread_t on Unix platforms, * and \p HANDLE on native Windows platforms. 
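As a hedged usage sketch for the CPU-binding entry points documented in the hunks above (hypothetical caller code; `topology` is assumed loaded, and <errno.h>/<stdio.h> provide the ENOSYS/EXDEV values and fprintf mentioned here):

    /* bind the calling thread to the first PU of the machine */
    hwloc_obj_t pu = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);
    if (pu && hwloc_set_cpubind(topology, pu->cpuset, HWLOC_CPUBIND_THREAD) < 0) {
        if (errno == ENOSYS)
            fprintf(stderr, "CPU binding not supported on this platform\n");
        else if (errno == EXDEV)
            fprintf(stderr, "CPU binding could not be enforced\n");
    }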
@@ -1267,6 +1328,8 @@ HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thr * is filled with the list of PUs which the thread * was last bound to. * + * \return 0 on success, -1 on error. + * * \note \p hwloc_thread_t is \p pthread_t on Unix platforms, * and \p HANDLE on native Windows platforms. * @@ -1291,6 +1354,8 @@ HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thr * on which all threads are running), or only the current thread. If the * process is single-threaded, flags can be set to zero to let hwloc use * whichever method is available on the underlying OS. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags); @@ -1305,6 +1370,8 @@ HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_ * so this function may return something that is already * outdated. * + * \return 0 on success, -1 on error. + * * \note \p hwloc_pid_t is \p pid_t on Unix platforms, * and \p HANDLE on native Windows platforms. * @@ -1343,7 +1410,7 @@ HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, h * (e.g., some systems only allow binding memory on a per-thread * basis, whereas other systems only allow binding memory for all * threads in a process). - * \p errno will be set to EXDEV when the requested set can not be enforced + * \p errno will be set to \c EXDEV when the requested set can not be enforced * (e.g., some systems only allow binding memory to a single NUMA node). * * If ::HWLOC_MEMBIND_STRICT was not passed, the function may fail as well, @@ -1417,6 +1484,12 @@ typedef enum { HWLOC_MEMBIND_FIRSTTOUCH = 1, /** \brief Allocate memory on the specified nodes. + * + * The actual behavior may slightly vary between operating systems, + * especially when (some of) the requested nodes are full. + * On Linux, by default, the MPOL_PREFERRED_MANY (or MPOL_PREFERRED) policy + * is used. However, if the hwloc strict flag is also given, the Linux + * MPOL_BIND policy is rather used. * \hideinitializer */ HWLOC_MEMBIND_BIND = 2, @@ -1492,7 +1565,7 @@ typedef enum { * could potentially affect CPU bindings. Note, however, that using * NOCPUBIND may reduce hwloc's overall memory binding * support. Specifically: some of hwloc's memory binding functions - * may fail with errno set to ENOSYS when used with NOCPUBIND. + * may fail with errno set to \c ENOSYS when used with NOCPUBIND. * \hideinitializer */ HWLOC_MEMBIND_NOCPUBIND = (1<<4), @@ -1521,8 +1594,9 @@ typedef enum { * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset. * Otherwise it's a cpuset. * - * \return -1 with errno set to ENOSYS if the action is not supported - * \return -1 with errno set to EXDEV if the binding cannot be enforced + * \return 0 on success. + * \return -1 with errno set to \c ENOSYS if the action is not supported. + * \return -1 with errno set to \c EXDEV if the binding cannot be enforced. */ HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags); @@ -1551,7 +1625,7 @@ HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitm * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS * is also specified. In this case, hwloc will check the default * memory policies and nodesets for all threads in the process. If - * they are not identical, -1 is returned and errno is set to EXDEV. 
+ * they are not identical, -1 is returned and errno is set to \c EXDEV. * If they are identical, the values are returned in \p set and \p * policy. * @@ -1571,7 +1645,9 @@ HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_bitm * Otherwise it's a cpuset. * * If any other flags are specified, -1 is returned and errno is set - * to EINVAL. + * to \c EINVAL. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags); @@ -1581,8 +1657,9 @@ HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_bitmap_t s * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset. * Otherwise it's a cpuset. * - * \return -1 with errno set to ENOSYS if the action is not supported - * \return -1 with errno set to EXDEV if the binding cannot be enforced + * \return 0 on success. + * \return -1 with errno set to \c ENOSYS if the action is not supported. + * \return -1 with errno set to \c EXDEV if the binding cannot be enforced. * * \note \p hwloc_pid_t is \p pid_t on Unix platforms, * and \p HANDLE on native Windows platforms. @@ -1614,7 +1691,7 @@ HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default * memory policies and nodesets for all threads in the specified * process. If they are not identical, -1 is returned and errno is - * set to EXDEV. If they are identical, the values are returned in \p + * set to \c EXDEV. If they are identical, the values are returned in \p * set and \p policy. * * Otherwise, \p set is set to the logical OR of all threads' @@ -1626,7 +1703,9 @@ HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t * Otherwise it's a cpuset. * * If any other flags are specified, -1 is returned and errno is set - * to EINVAL. + * to \c EINVAL. + * + * \return 0 on success, -1 on error. * * \note \p hwloc_pid_t is \p pid_t on Unix platforms, * and \p HANDLE on native Windows platforms. @@ -1639,9 +1718,9 @@ HWLOC_DECLSPEC int hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset. * Otherwise it's a cpuset. * - * \return 0 if \p len is 0. - * \return -1 with errno set to ENOSYS if the action is not supported - * \return -1 with errno set to EXDEV if the binding cannot be enforced + * \return 0 on success or if \p len is 0. + * \return -1 with errno set to \c ENOSYS if the action is not supported. + * \return -1 with errno set to \c EXDEV if the binding cannot be enforced. */ HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags); @@ -1658,7 +1737,7 @@ HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void * * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first * checked to see if they all have the same memory binding policy and - * nodeset. If they do not, -1 is returned and errno is set to EXDEV. + * nodeset. If they do not, -1 is returned and errno is set to \c EXDEV. * If they are identical across all pages, the set and policy are * returned in \p set and \p policy, respectively. * @@ -1671,9 +1750,10 @@ HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void * Otherwise it's a cpuset. 
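A minimal sketch of the nodeset-based memory binding described above (an assumption-laden fragment: `topology` is loaded and exposes at least one NUMA node):

    /* bind future allocations of the current process to the first NUMA node */
    hwloc_obj_t node = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 0);
    if (node && hwloc_set_membind(topology, node->nodeset, HWLOC_MEMBIND_BIND,
                                  HWLOC_MEMBIND_BYNODESET) < 0)
        perror("hwloc_set_membind"); /* ENOSYS or EXDEV per the docs above */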
* * If any other flags are specified, -1 is returned and errno is set - * to EINVAL. + * to \c EINVAL. * - * If \p len is 0, -1 is returned and errno is set to EINVAL. + * \return 0 on success. + * \return -1 with errno set to \c EINVAL if \p len is 0. */ HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags); @@ -1696,6 +1776,8 @@ HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void * considered a nodeset. Otherwise it's a cpuset. * * If \p len is 0, \p set is emptied. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_get_area_memlocation(hwloc_topology_t topology, const void *addr, size_t len, hwloc_bitmap_t set, int flags); @@ -1704,17 +1786,20 @@ HWLOC_DECLSPEC int hwloc_get_area_memlocation(hwloc_topology_t topology, const v * This is equivalent to malloc(), except that it tries to allocate * page-aligned memory from the OS. * + * \return a pointer to the allocated area, or \c NULL on error. + * * \note The allocated memory should be freed with hwloc_free(). */ HWLOC_DECLSPEC void *hwloc_alloc(hwloc_topology_t topology, size_t len); /** \brief Allocate some memory on NUMA memory nodes specified by \p set * - * \return NULL with errno set to ENOSYS if the action is not supported - * and ::HWLOC_MEMBIND_STRICT is given - * \return NULL with errno set to EXDEV if the binding cannot be enforced - * and ::HWLOC_MEMBIND_STRICT is given - * \return NULL with errno set to ENOMEM if the memory allocation failed + * \return a pointer to the allocated area. + * \return NULL with errno set to \c ENOSYS if the action is not supported + * and ::HWLOC_MEMBIND_STRICT is given. + * \return NULL with errno set to \c EXDEV if the binding cannot be enforced + * and ::HWLOC_MEMBIND_STRICT is given. + * \return NULL with errno set to \c ENOMEM if the memory allocation failed * even before trying to bind. * * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset. @@ -1735,12 +1820,16 @@ HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, * * If ::HWLOC_MEMBIND_BYNODESET is specified, set is considered a nodeset. * Otherwise it's a cpuset. + * + * \return a pointer to the allocated area, or \c NULL on error. */ static __hwloc_inline void * hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_bitmap_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc; /** \brief Free memory that was previously allocated by hwloc_alloc() * or hwloc_alloc_membind(). + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len); @@ -1749,6 +1838,9 @@ HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len) /** \defgroup hwlocality_setsource Changing the Source of Topology Discovery + * + * These functions must be called between hwloc_topology_init() and hwloc_topology_load(). + * Otherwise, they will return -1 with errno set to \c EBUSY. * * If none of the functions below is called, the default is to detect all the objects * of the machine that the caller is allowed to access. @@ -1777,8 +1869,10 @@ HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len) * \note \p hwloc_pid_t is \p pid_t on Unix platforms, * and \p HANDLE on native Windows platforms. 
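For illustration, a sketch pairing hwloc_alloc_membind() with hwloc_free() (the length is an arbitrary example, `topology` is assumed loaded):

    size_t len = 1 << 20; /* 1 MiB, arbitrary */
    hwloc_obj_t node = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, 0);
    void *buf = node ? hwloc_alloc_membind(topology, len, node->nodeset,
                                           HWLOC_MEMBIND_BIND,
                                           HWLOC_MEMBIND_BYNODESET)
                     : NULL;
    if (!buf)
        perror("hwloc_alloc_membind"); /* NULL with ENOSYS/EXDEV/ENOMEM set */
    else
        hwloc_free(topology, buf, len);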
* - * \note -1 is returned and errno is set to ENOSYS on platforms that do not + * \note -1 is returned and errno is set to \c ENOSYS on platforms that do not * support this feature. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topology, hwloc_pid_t pid); @@ -1796,13 +1890,16 @@ HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topo * * If \p description was properly parsed and describes a valid topology * configuration, this function returns 0. - * Otherwise -1 is returned and errno is set to EINVAL. + * Otherwise -1 is returned and errno is set to \c EINVAL. * * Note that this function does not actually load topology * information; it just tells hwloc where to load it from. You'll * still need to invoke hwloc_topology_load() to actually load the * topology information. * + * \return 0 on success. + * \return -1 with errno set to \c EINVAL if the description was invalid. + * * \note For convenience, this backend provides empty binding hooks which just * return success. * @@ -1824,7 +1921,8 @@ HWLOC_DECLSPEC int hwloc_topology_set_synthetic(hwloc_topology_t __hwloc_restric * still need to invoke hwloc_topology_load() to actually load the * topology information. * - * \return -1 with errno set to EINVAL on failure to read the XML file. + * \return 0 on success. + * \return -1 with errno set to \c EINVAL on failure to read the XML file. * * \note See also hwloc_topology_set_userdata_import_callback() * for importing application-specific object userdata. @@ -1852,7 +1950,8 @@ HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topo * still need to invoke hwloc_topology_load() to actually load the * topology information. * - * \return -1 with errno set to EINVAL on failure to read the XML buffer. + * \return 0 on success. + * \return -1 with errno set to \c EINVAL on failure to read the XML buffer. * * \note See also hwloc_topology_set_userdata_import_callback() * for importing application-specific object userdata. @@ -1890,6 +1989,9 @@ enum hwloc_topology_components_flag_e { * This may be used to avoid expensive parts of the discovery process. * For instance, CUDA-specific discovery may be expensive and unneeded * while generic I/O discovery could still be useful. + * + * \return 0 on success. + * \return -1 on error, for instance if flags are invalid. */ HWLOC_DECLSPEC int hwloc_topology_set_components(hwloc_topology_t __hwloc_restrict topology, unsigned long flags, const char * __hwloc_restrict name); @@ -2092,6 +2194,9 @@ enum hwloc_topology_flags_e { * By default, no flags are set (\c 0). * * The flags set in a topology may be retrieved with hwloc_topology_get_flags(). + * + * \return 0 on success. + * \return -1 on error, for instance if flags are invalid. */ HWLOC_DECLSPEC int hwloc_topology_set_flags (hwloc_topology_t topology, unsigned long flags); @@ -2103,6 +2208,8 @@ HWLOC_DECLSPEC int hwloc_topology_set_flags (hwloc_topology_t topology, unsigned * no flags are set (\c 0 is returned). * * \return the flags previously set with hwloc_topology_set_flags(). + * + * \note This function may also be called after hwloc_topology_load(). */ HWLOC_DECLSPEC unsigned long hwloc_topology_get_flags (hwloc_topology_t topology); @@ -2112,6 +2219,8 @@ HWLOC_DECLSPEC unsigned long hwloc_topology_get_flags (hwloc_topology_t topology * running this program. * \return 0 instead (for instance if using another file-system root, * a XML topology file, or a synthetic topology). 
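A brief sketch of the init/set-source/load ordering that this section now spells out, using a synthetic topology (the description string is an arbitrary example):

    hwloc_topology_t topo;
    hwloc_topology_init(&topo);
    /* source changes must happen between init and load */
    if (hwloc_topology_set_synthetic(topo, "pack:2 core:4 pu:2") < 0)
        perror("hwloc_topology_set_synthetic"); /* EINVAL for a bad description */
    else
        hwloc_topology_load(topo);
    hwloc_topology_destroy(topo);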
+ *
+ * \note This function may also be called after hwloc_topology_load().
 */ HWLOC_DECLSPEC int hwloc_topology_is_thissystem(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
@@ -2239,6 +2348,14 @@ struct hwloc_topology_support {
 * to report the supported features of the original remote machine
 * instead. If it was successfully imported, \p imported_support
 * will be set in the struct hwloc_topology_misc_support array.
+ *
+ * \return A pointer to a support structure.
+ *
+ * \note The function cannot return \c NULL.
+ * \note The returned pointer should not be freed, it belongs to the hwloc library.
+ *
+ * \note This function may be called before or after hwloc_topology_load()
+ * but the support structure only contains valid information after.
 */ HWLOC_DECLSPEC const struct hwloc_topology_support *hwloc_topology_get_support(hwloc_topology_t __hwloc_restrict topology);
@@ -2298,32 +2415,44 @@ enum hwloc_type_filter_e { };
 /** \brief Set the filtering for the given object type.
+ *
+ * \return 0 on success, -1 on error.
 */ HWLOC_DECLSPEC int hwloc_topology_set_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e filter);
 /** \brief Get the current filtering for the given object type.
+ *
+ * \return 0 on success, -1 on error.
 */ HWLOC_DECLSPEC int hwloc_topology_get_type_filter(hwloc_topology_t topology, hwloc_obj_type_t type, enum hwloc_type_filter_e *filter);
 /** \brief Set the filtering for all object types.
 *
 * If some types do not support this filtering, they are silently ignored.
+ *
+ * \return 0 on success, -1 on error.
 */ HWLOC_DECLSPEC int hwloc_topology_set_all_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
 /** \brief Set the filtering for all CPU cache object types.
 *
 * Memory-side caches are not involved since they are not CPU caches.
+ *
+ * \return 0 on success, -1 on error.
 */ HWLOC_DECLSPEC int hwloc_topology_set_cache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
 /** \brief Set the filtering for all CPU instruction cache object types.
 *
 * Memory-side caches are not involved since they are not CPU caches.
+ *
+ * \return 0 on success, -1 on error.
 */ HWLOC_DECLSPEC int hwloc_topology_set_icache_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
 /** \brief Set the filtering for all I/O object types.
+ *
+ * \return 0 on success, -1 on error.
 */ HWLOC_DECLSPEC int hwloc_topology_set_io_types_filter(hwloc_topology_t topology, enum hwloc_type_filter_e filter);
@@ -2343,6 +2472,9 @@ HWLOC_DECLSPEC void hwloc_topology_set_userdata(hwloc_topology_t topology, const
 *
 * Retrieve the application-given private data pointer that was
 * previously set with hwloc_topology_set_userdata().
+ *
+ * \return A pointer to the private data if any.
+ * \return \c NULL if no private data was previously set.
 */ HWLOC_DECLSPEC void * hwloc_topology_get_userdata(hwloc_topology_t topology);
@@ -2395,21 +2527,32 @@ enum hwloc_restrict_flags_e {
 * are not included (or partially included) in the CPU set \p set.
 * All objects CPU and node sets are restricted accordingly.
 *
+ * By default, \p set is a CPU set. It means that the set of PUs in
+ * the topology is restricted. Once some PUs are removed, their parents
+ * may also get removed recursively if they become child-less.
+ *
 * If ::HWLOC_RESTRICT_FLAG_BYNODESET is passed in \p flags,
 * \p set is considered a nodeset instead of a CPU set.
+ * It means that the set of NUMA nodes in the topology is restricted
+ * (instead of PUs). Once some NUMA nodes are removed, their parents
+ * may also get removed recursively if they become child-less.
 *
 * \p flags is a OR'ed set of ::hwloc_restrict_flags_e.
 *
+ * \note Restricting the topology removes some locality information,
+ * hence the remaining objects may get reordered (including PUs and NUMA nodes),
+ * and their logical indexes may change.
+ *
 * \note This call may not be reverted by restricting back to a larger
 * set. Once dropped during restriction, objects may not be brought
 * back, except by loading another topology with hwloc_topology_load().
 *
 * \return 0 on success.
 *
- * \return -1 with errno set to EINVAL if the input set is invalid.
+ * \return -1 with errno set to \c EINVAL if the input set is invalid.
 * The topology is not modified in this case.
 *
- * \return -1 with errno set to ENOMEM on failure to allocate internal data.
+ * \return -1 with errno set to \c ENOMEM on failure to allocate internal data.
 * The topology is reinitialized in this case. It should be either
 * destroyed with hwloc_topology_destroy() or configured and loaded again.
 */
@@ -2449,6 +2592,8 @@ enum hwloc_allow_flags_e {
 *
 * \p flags must be set to one flag among ::hwloc_allow_flags_e.
 *
+ * \return 0 on success, -1 on error.
+ *
 * \note Removing objects from a topology should rather be performed with
 * hwloc_topology_restrict().
 */
@@ -2483,6 +2628,9 @@ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_misc_object(hwloc_topology_t to
 *
 * The caller should (at least) initialize its sets before inserting
 * the object in the topology. See hwloc_topology_insert_group_object().
+ *
+ * \return The allocated object on success.
+ * \return \c NULL on error.
 */ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t topology);
@@ -2506,7 +2654,7 @@ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t to
 * The \p subtype object attribute may be defined (to a dynamically
 * allocated string) to display something else than "Group" as the
 * type name for this object in lstopo.
- * Custom name/value info pairs may be added with hwloc_obj_add_info() after
+ * Custom name-value info pairs may be added with hwloc_obj_add_info() after
 * insertion.
 *
 * The group \p dont_merge attribute may be set to \c 1 to prevent
@@ -2519,6 +2667,10 @@ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t to
 * as \c 0xffffffff to tell hwloc that this new Group should always
 * be discarded in favor of any existing Group with the same locality.
 *
+ * \note Inserting a group adds some locality information to the topology,
+ * hence the existing objects may get reordered (including PUs and NUMA nodes),
+ * and their logical indexes may change.
+ *
 * \return The inserted object if it was properly inserted.
 *
 * \return An existing object if the Group was merged or discarded
@@ -2542,6 +2694,9 @@ HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_group_object(hwloc_topology_t t
 * This function is convenient between hwloc_topology_alloc_group_object()
 * and hwloc_topology_insert_group_object(). It builds the sets of the new Group
 * that will be inserted as a new intermediate parent of several objects.
+ *
+ * \return 0 on success.
+ * \return -1 with errno set to \c ENOMEM if some internal reallocation failed.
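A hedged sketch of hwloc_topology_restrict() as documented above (hypothetical fragment; `topology` is assumed loaded and to contain at least one package):

    /* keep only what is contained in the first package's cpuset */
    hwloc_obj_t pkg = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PACKAGE, 0);
    if (pkg && hwloc_topology_restrict(topology, pkg->cpuset, 0) < 0)
        perror("hwloc_topology_restrict"); /* EINVAL or ENOMEM per the docs */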
*/ HWLOC_DECLSPEC int hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src); @@ -2558,6 +2713,9 @@ HWLOC_DECLSPEC int hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src * attributes, etc. * * See also \ref threadsafety + * + * \return 0 on success. + * \return -1 on error, for instance if some internal reallocation failed. */ HWLOC_DECLSPEC int hwloc_topology_refresh(hwloc_topology_t topology); diff --git a/deps/hwloc/include/hwloc/autogen/config.h b/deps/hwloc/include/hwloc/autogen/config.h index b2de227dd..3d4238a87 100644 --- a/deps/hwloc/include/hwloc/autogen/config.h +++ b/deps/hwloc/include/hwloc/autogen/config.h @@ -12,10 +12,10 @@ #ifndef HWLOC_CONFIG_H #define HWLOC_CONFIG_H -#define HWLOC_VERSION "2.9.0rc2-git" +#define HWLOC_VERSION "2.9.3rc2-git" #define HWLOC_VERSION_MAJOR 2 #define HWLOC_VERSION_MINOR 9 -#define HWLOC_VERSION_RELEASE 0 +#define HWLOC_VERSION_RELEASE 3 #define HWLOC_VERSION_GREEK "rc2" /* #undef HWLOC_PCI_COMPONENT_BUILTIN */ diff --git a/deps/hwloc/include/hwloc/bitmap.h b/deps/hwloc/include/hwloc/bitmap.h index cd118b387..6b56bcb9b 100644 --- a/deps/hwloc/include/hwloc/bitmap.h +++ b/deps/hwloc/include/hwloc/bitmap.h @@ -1,6 +1,6 @@ /* * Copyright © 2009 CNRS - * Copyright © 2009-2022 Inria. All rights reserved. + * Copyright © 2009-2023 Inria. All rights reserved. * Copyright © 2009-2012 Université Bordeaux * Copyright © 2009-2011 Cisco Systems, Inc. All rights reserved. * See COPYING in top-level directory. @@ -50,9 +50,10 @@ extern "C" { * hwloc_bitmap_free(set); * \endcode * - * \note Most functions below return an int that may be negative in case of - * error. The usual error case would be an internal failure to realloc/extend + * \note Most functions below return 0 on success and -1 on error. + * The usual error case would be an internal failure to realloc/extend * the storage of the bitmap (\p errno would be set to \c ENOMEM). + * See also \ref hwlocality_api_error_reporting. * * \note Several examples of using the bitmap API are available under the * doc/examples/ directory in the source tree. @@ -83,7 +84,13 @@ typedef const struct hwloc_bitmap_s * hwloc_const_bitmap_t; */ HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc(void) __hwloc_attribute_malloc; -/** \brief Allocate a new full bitmap. */ +/** \brief Allocate a new full bitmap. + * + * \returns A valid bitmap or \c NULL. + * + * The bitmap should be freed by a corresponding call to + * hwloc_bitmap_free(). + */ HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc_full(void) __hwloc_attribute_malloc; /** \brief Free bitmap \p bitmap. @@ -119,11 +126,13 @@ HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buf /** \brief Stringify a bitmap into a newly allocated string. * - * \return -1 on error. + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_bitmap_asprintf(char ** strp, hwloc_const_bitmap_t bitmap); /** \brief Parse a bitmap string and stores it in bitmap \p bitmap. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string); @@ -144,11 +153,13 @@ HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_ /** \brief Stringify a bitmap into a newly allocated list string. * - * \return -1 on error. + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_bitmap_list_asprintf(char ** strp, hwloc_const_bitmap_t bitmap); /** \brief Parse a list string and stores it in bitmap \p bitmap. 
+ * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string); @@ -168,11 +179,13 @@ HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, si /** \brief Stringify a bitmap into a newly allocated taskset-specific string. * - * \return -1 on error. + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_bitmap_taskset_asprintf(char ** strp, hwloc_const_bitmap_t bitmap); /** \brief Parse a taskset-specific bitmap string and stores it in bitmap \p bitmap. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_bitmap_taskset_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string); @@ -279,6 +292,7 @@ HWLOC_DECLSPEC int hwloc_bitmap_to_ulongs(hwloc_const_bitmap_t bitmap, unsigned * When called on the output of hwloc_topology_get_topology_cpuset(), * the returned number is large enough for all cpusets of the topology. * + * \return the number of unsigned longs required. * \return -1 if \p bitmap is infinite. */ HWLOC_DECLSPEC int hwloc_bitmap_nr_ulongs(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure; @@ -305,21 +319,23 @@ HWLOC_DECLSPEC int hwloc_bitmap_isfull(hwloc_const_bitmap_t bitmap) __hwloc_attr /** \brief Compute the first index (least significant bit) in bitmap \p bitmap * - * \return -1 if no index is set in \p bitmap. + * \return the first index set in \p bitmap. + * \return -1 if \p bitmap is empty. */ HWLOC_DECLSPEC int hwloc_bitmap_first(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure; /** \brief Compute the next index in bitmap \p bitmap which is after index \p prev * - * If \p prev is -1, the first index is returned. - * + * \return the first index set in \p bitmap if \p prev is \c -1. + * \return the next index set in \p bitmap if \p prev is not \c -1. * \return -1 if no index with higher index is set in \p bitmap. */ HWLOC_DECLSPEC int hwloc_bitmap_next(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure; /** \brief Compute the last index (most significant bit) in bitmap \p bitmap * - * \return -1 if no index is set in \p bitmap, or if \p bitmap is infinitely set. + * \return the last index set in \p bitmap. + * \return -1 if \p bitmap is empty, or if \p bitmap is infinitely set. */ HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure; @@ -327,28 +343,29 @@ HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attrib * indexes that are in the bitmap). * * \return the number of indexes that are in the bitmap. - * * \return -1 if \p bitmap is infinitely set. */ HWLOC_DECLSPEC int hwloc_bitmap_weight(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure; /** \brief Compute the first unset index (least significant bit) in bitmap \p bitmap * - * \return -1 if no index is unset in \p bitmap. + * \return the first unset index in \p bitmap. + * \return -1 if \p bitmap is full. */ HWLOC_DECLSPEC int hwloc_bitmap_first_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure; /** \brief Compute the next unset index in bitmap \p bitmap which is after index \p prev * - * If \p prev is -1, the first unset index is returned. - * + * \return the first index unset in \p bitmap if \p prev is \c -1. + * \return the next index unset in \p bitmap if \p prev is not \c -1. * \return -1 if no index with higher index is unset in \p bitmap. 
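To make the first()/next() iteration convention above concrete, a small self-contained sketch over an explicitly built bitmap (hypothetical caller code):

    hwloc_bitmap_t set = hwloc_bitmap_alloc();
    if (set) {
        int i;
        hwloc_bitmap_set_range(set, 0, 3); /* indexes 0-3 */
        hwloc_bitmap_clr(set, 2);
        /* first() then next() until -1: prints 0, 1, 3 */
        for (i = hwloc_bitmap_first(set); i != -1; i = hwloc_bitmap_next(set, i))
            printf("index %d\n", i);
        hwloc_bitmap_free(set);
    }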
*/ HWLOC_DECLSPEC int hwloc_bitmap_next_unset(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure; /** \brief Compute the last unset index (most significant bit) in bitmap \p bitmap * - * \return -1 if no index is unset in \p bitmap, or if \p bitmap is infinitely set. + * \return the last index unset in \p bitmap. + * \return -1 if \p bitmap is full, or if \p bitmap is not infinitely set. */ HWLOC_DECLSPEC int hwloc_bitmap_last_unset(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure; @@ -428,6 +445,8 @@ HWLOC_DECLSPEC int hwloc_bitmap_not (hwloc_bitmap_t res, hwloc_const_bitmap_t bi /** \brief Test whether bitmaps \p bitmap1 and \p bitmap2 intersects. * * \return 1 if bitmaps intersect, 0 otherwise. + * + * \note The empty bitmap does not intersect any other bitmap. */ HWLOC_DECLSPEC int hwloc_bitmap_intersects (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure; diff --git a/deps/hwloc/include/hwloc/diff.h b/deps/hwloc/include/hwloc/diff.h index 0ad0486be..f7e6fb1e7 100644 --- a/deps/hwloc/include/hwloc/diff.h +++ b/deps/hwloc/include/hwloc/diff.h @@ -1,5 +1,5 @@ /* - * Copyright © 2013-2020 Inria. All rights reserved. + * Copyright © 2013-2023 Inria. All rights reserved. * See COPYING in top-level directory. */ @@ -222,6 +222,8 @@ enum hwloc_topology_diff_apply_flags_e { HWLOC_DECLSPEC int hwloc_topology_diff_apply(hwloc_topology_t topology, hwloc_topology_diff_t diff, unsigned long flags); /** \brief Destroy a list of topology differences. + * + * \return 0. */ HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_diff_t diff); @@ -233,6 +235,8 @@ HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_diff_t diff); * This identifier is usually the name of the other XML file * that contains the reference topology. * + * \return 0 on success, -1 on error. + * * \note the pointer returned in refname should later be freed * by the caller. */ @@ -246,6 +250,8 @@ HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(const char *xmlpath, hwloc_topol * This identifier is usually the name of the other XML file * that contains the reference topology. * This attribute is given back when reading the diff from XML. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, const char *refname, const char *xmlpath); @@ -257,6 +263,8 @@ HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_diff_t diff, co * This identifier is usually the name of the other XML file * that contains the reference topology. * + * \return 0 on success, -1 on error. + * * \note the pointer returned in refname should later be freed * by the caller. */ @@ -274,6 +282,8 @@ HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(const char *xmlbuffer, int * The returned buffer ends with a \0 that is included in the returned * length. * + * \return 0 on success, -1 on error. + * * \note The XML buffer should later be freed with hwloc_free_xmlbuffer(). */ HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_diff_t diff, const char *refname, char **xmlbuffer, int *buflen); diff --git a/deps/hwloc/include/hwloc/distances.h b/deps/hwloc/include/hwloc/distances.h index effa8663e..71cca4b5f 100644 --- a/deps/hwloc/include/hwloc/distances.h +++ b/deps/hwloc/include/hwloc/distances.h @@ -1,5 +1,5 @@ /* - * Copyright © 2010-2022 Inria. All rights reserved. + * Copyright © 2010-2023 Inria. All rights reserved. * See COPYING in top-level directory. 
*/ @@ -131,6 +131,8 @@ enum hwloc_distances_kind_e { * * Each distance matrix returned in the \p distances array should be released * by the caller using hwloc_distances_release(). + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_distances_get(hwloc_topology_t topology, @@ -140,6 +142,8 @@ hwloc_distances_get(hwloc_topology_t topology, /** \brief Retrieve distance matrices for object at a specific depth in the topology. * * Identical to hwloc_distances_get() with the additional \p depth filter. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_distances_get_by_depth(hwloc_topology_t topology, int depth, @@ -149,6 +153,8 @@ hwloc_distances_get_by_depth(hwloc_topology_t topology, int depth, /** \brief Retrieve distance matrices for object of a specific type. * * Identical to hwloc_distances_get() with the additional \p type filter. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type, @@ -162,6 +168,8 @@ hwloc_distances_get_by_type(hwloc_topology_t topology, hwloc_obj_type_t type, * The name of the most common structure is "NUMALatency". * Others include "XGMIBandwidth", "XGMIHops", "XeLinkBandwidth", * and "NVLinkBandwidth". + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name, @@ -171,7 +179,12 @@ hwloc_distances_get_by_name(hwloc_topology_t topology, const char *name, /** \brief Get a description of what a distances structure contains. * * For instance "NUMALatency" for hardware-provided NUMA distances (ACPI SLIT), - * or NULL if unknown. + * or \c NULL if unknown. + * + * \return the constant string with the name of the distance structure. + * + * \note The returned name should not be freed by the caller, + * it belongs to the hwloc library. */ HWLOC_DECLSPEC const char * hwloc_distances_get_name(hwloc_topology_t topology, struct hwloc_distances_s *distances); @@ -252,6 +265,8 @@ enum hwloc_distances_transform_e { * * \p flags must be \c 0 for now. * + * \return 0 on success, -1 on error for instance if flags are invalid. + * * \note Objects in distances array \p objs may be directly modified * in place without using hwloc_distances_transform(). * One may use hwloc_get_obj_with_same_locality() to easily convert @@ -272,6 +287,7 @@ HWLOC_DECLSPEC int hwloc_distances_transform(hwloc_topology_t topology, struct h /** \brief Find the index of an object in a distances structure. * + * \return the index of the object in the distances structure if any. * \return -1 if object \p obj is not involved in structure \p distances. */ static __hwloc_inline int @@ -289,6 +305,7 @@ hwloc_distances_obj_index(struct hwloc_distances_s *distances, hwloc_obj_t obj) * The distance from \p obj1 to \p obj2 is stored in the value pointed by * \p value1to2 and reciprocally. * + * \return 0 on success. * \return -1 if object \p obj1 or \p obj2 is not involved in structure \p distances. */ static __hwloc_inline int @@ -374,8 +391,8 @@ hwloc_distances_add_create(hwloc_topology_t topology, * * \p flags must be \c 0 for now. * - * \return \c 0 on success. - * \return \c -1 on error. + * \return 0 on success. + * \return -1 on error. */ HWLOC_DECLSPEC int hwloc_distances_add_values(hwloc_topology_t topology, hwloc_distances_add_handle_t handle, @@ -411,8 +428,8 @@ enum hwloc_distances_add_flag_e { * * On error, the temporary distances structure and its content are destroyed. * - * \return \c 0 on success. 
- * \return \c -1 on error. + * \return 0 on success. + * \return -1 on error. */ HWLOC_DECLSPEC int hwloc_distances_add_commit(hwloc_topology_t topology, hwloc_distances_add_handle_t handle, @@ -433,18 +450,24 @@ HWLOC_DECLSPEC int hwloc_distances_add_commit(hwloc_topology_t topology, * * If these distances were used to group objects, these additional * Group objects are not removed from the topology. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_distances_remove(hwloc_topology_t topology); /** \brief Remove distance matrices for objects at a specific depth in the topology. * * Identical to hwloc_distances_remove() but only applies to one level of the topology. + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_distances_remove_by_depth(hwloc_topology_t topology, int depth); /** \brief Remove distance matrices for objects of a specific type in the topology. * * Identical to hwloc_distances_remove() but only applies to one level of the topology. + * + * \return 0 on success, -1 on error. */ static __hwloc_inline int hwloc_distances_remove_by_type(hwloc_topology_t topology, hwloc_obj_type_t type) @@ -458,6 +481,8 @@ hwloc_distances_remove_by_type(hwloc_topology_t topology, hwloc_obj_type_t type) /** \brief Release and remove the given distance matrix from the topology. * * This function includes a call to hwloc_distances_release(). + * + * \return 0 on success, -1 on error. */ HWLOC_DECLSPEC int hwloc_distances_release_remove(hwloc_topology_t topology, struct hwloc_distances_s *distances); diff --git a/deps/hwloc/include/hwloc/export.h b/deps/hwloc/include/hwloc/export.h index b178b77e5..2ce5ab188 100644 --- a/deps/hwloc/include/hwloc/export.h +++ b/deps/hwloc/include/hwloc/export.h @@ -55,7 +55,7 @@ enum hwloc_topology_export_xml_flags_e { * * \p flags is an OR'ed set of ::hwloc_topology_export_xml_flags_e. * - * \return -1 if a failure occured. + * \return 0 on success, or -1 on error. * * \note See also hwloc_topology_set_userdata_export_callback() * for exporting application-specific object userdata. @@ -91,7 +91,7 @@ HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const ch * * \p flags is an OR'ed set of ::hwloc_topology_export_xml_flags_e. * - * \return -1 if a failure occured. + * \return 0 on success, or -1 on error. * * \note See also hwloc_topology_set_userdata_export_callback() * for exporting application-specific object userdata. @@ -145,13 +145,15 @@ HWLOC_DECLSPEC void hwloc_topology_set_userdata_export_callback(hwloc_topology_t * that were given to the export callback. * * Only printable characters may be exported to XML string attributes. - * If a non-printable character is passed in \p name or \p buffer, - * the function returns -1 with errno set to EINVAL. * * If exporting binary data, the application should first encode into * printable characters only (or use hwloc_export_obj_userdata_base64()). * It should also take care of portability issues if the export may * be reimported on a different architecture. + * + * \return 0 on success. + * \return -1 with errno set to \c EINVAL if a non-printable character is + * passed in \p name or \p buffer.
*/ HWLOC_DECLSPEC int hwloc_export_obj_userdata(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length); @@ -165,8 +167,14 @@ HWLOC_DECLSPEC int hwloc_export_obj_userdata(void *reserved, hwloc_topology_t to * This function may only be called from within the export() callback passed * to hwloc_topology_set_userdata_export_callback(). * + * The name must be made of printable characters for export to XML string attributes. + * * The function does not take care of portability issues if the export * may be reimported on a different architecture. + * + * \return 0 on success. + * \return -1 with errno set to \c EINVAL if a non-printable character is + * passed in \p name. */ HWLOC_DECLSPEC int hwloc_export_obj_userdata_base64(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length); diff --git a/deps/hwloc/include/hwloc/helper.h b/deps/hwloc/include/hwloc/helper.h index 44994211f..acd9ef782 100644 --- a/deps/hwloc/include/hwloc/helper.h +++ b/deps/hwloc/include/hwloc/helper.h @@ -1,6 +1,6 @@ /* * Copyright © 2009 CNRS - * Copyright © 2009-2022 Inria. All rights reserved. + * Copyright © 2009-2023 Inria. All rights reserved. * Copyright © 2009-2012 Université Bordeaux * Copyright © 2009-2010 Cisco Systems, Inc. All rights reserved. * See COPYING in top-level directory. @@ -33,6 +33,7 @@ extern "C" { /** \brief Get the first largest object included in the given cpuset \p set. * * \return the first object that is included in \p set and whose parent is not. + * \return \c NULL if no such object exists. * * This is convenient for iterating over all largest objects within a CPU set * by doing a loop getting the first largest object and clearing its CPU set @@ -65,15 +66,19 @@ hwloc_get_first_largest_obj_inside_cpuset(hwloc_topology_t topology, hwloc_const /** \brief Get the set of largest objects covering exactly a given cpuset \p set * * \return the number of objects returned in \p objs. + * \return -1 if no set of objects may cover that cpuset. */ HWLOC_DECLSPEC int hwloc_get_largest_objs_inside_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set, hwloc_obj_t * __hwloc_restrict objs, int max); /** \brief Return the next object at depth \p depth included in CPU set \p set. * - * If \p prev is \c NULL, return the first object at depth \p depth - * included in \p set. The next invokation should pass the previous - * return value in \p prev so as to obtain the next object in \p set. + * The next invocation should pass the previous return value in \p prev + * so as to obtain the next object in \p set. + * + * \return the first object at depth \p depth included in \p set if \p prev is \c NULL. + * \return the next object at depth \p depth included in \p set if \p prev is not \c NULL. + * \return \c NULL if there is no next object. * * \note Objects with empty CPU sets are ignored * (otherwise they would be considered included in any given set). @@ -95,9 +100,15 @@ hwloc_get_next_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_cons /** \brief Return the next object of type \p type included in CPU set \p set. * - * If there are multiple or no depth for given type, return \c NULL - * and let the caller fallback to - * hwloc_get_next_obj_inside_cpuset_by_depth(). + * The next invocation should pass the previous return value in \p prev + * so as to obtain the next object in \p set. + * + * \return the first object of type \p type included in \p set if \p prev is \c NULL.
+ * \return the next object of type \p type included in \p set if \p prev is not \c NULL. + * \return \c NULL if there is no next object. + * \return \c NULL if there is no depth for the given type. + * \return \c NULL if there are multiple depths for the given type, + * the caller should fall back to hwloc_get_next_obj_inside_cpuset_by_depth(). * * \note Objects with empty CPU sets are ignored * (otherwise they would be considered included in any given set). @@ -116,6 +127,8 @@ hwloc_get_next_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const } /** \brief Return the (logically) \p idx -th object at depth \p depth included in CPU set \p set. + * + * \return the object if any, \c NULL otherwise. * * \note Objects with empty CPU sets are ignored * (otherwise they would be considered included in any given set). @@ -147,9 +160,11 @@ hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpu /** \brief Return the \p idx -th object of type \p type included in CPU set \p set. * - * If there are multiple or no depth for given type, return \c NULL - * and let the caller fallback to - * hwloc_get_obj_inside_cpuset_by_depth(). + * \return the object if any. + * \return \c NULL if there is no such object. + * \return \c NULL if there is no depth for the given type. + * \return \c NULL if there are multiple depths for the given type, + * the caller should fall back to hwloc_get_obj_inside_cpuset_by_depth(). * * \note Objects with empty CPU sets are ignored * (otherwise they would be considered included in any given set). @@ -171,6 +186,9 @@ hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpus } /** \brief Return the number of objects at depth \p depth included in CPU set \p set. + * + * \return the number of objects. + * \return 0 if the depth is invalid. * * \note Objects with empty CPU sets are ignored * (otherwise they would be considered included in any given set). @@ -199,9 +217,10 @@ hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_ /** \brief Return the number of objects of type \p type included in CPU set \p set. * - * If no object for that type exists inside CPU set \p set, 0 is - * returned. If there are several levels with objects of that type - * inside CPU set \p set, -1 is returned. + * \return the number of objects. + * \return 0 if there are no objects of that type in the topology. + * \return -1 if there are multiple levels of objects of that type, + * the caller should fall back to hwloc_get_nbobjs_inside_cpuset_by_depth(). * * \note Objects with empty CPU sets are ignored * (otherwise they would be considered included in any given set). @@ -232,6 +251,9 @@ hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_c * Otherwise, this is similar to a logical index within the part of the topology * defined by CPU set \p set. * + * \return the logical index among the objects included in the set if any. + * \return -1 if the object is not included in the set. + * * \note Objects with empty CPU sets are ignored * (otherwise they would be considered included in any given set). * @@ -264,6 +286,7 @@ hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_u /** \brief Get the child covering at least CPU set \p set. * + * \return the child that covers the set entirely. * \return \c NULL if no child matches or if \p set is empty. * * \note This function cannot work if parent does not have a CPU set (I/O or Misc objects).
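To make the return conventions above concrete, here is a minimal C sketch (not part of the patch; the helper name list_cores_inside is illustrative) of the intended calling pattern for the inside-cpuset count and iterator:

#include <hwloc.h>
#include <stdio.h>

/* Illustrative helper: list the cores contained in `set`, following the
 * documented contract: the count returns -1 when multiple depths match the
 * type (so the caller should fall back to the by-depth variant), and the
 * iterator returns the first object when prev is NULL and NULL once
 * exhausted. */
static void list_cores_inside(hwloc_topology_t topology, hwloc_const_cpuset_t set)
{
    hwloc_obj_t obj = NULL;
    int n = hwloc_get_nbobjs_inside_cpuset_by_type(topology, set, HWLOC_OBJ_CORE);
    if (n == -1)
        return; /* multiple levels of that type: use the by-depth variant instead */
    while ((obj = hwloc_get_next_obj_inside_cpuset_by_type(topology, set,
                                                           HWLOC_OBJ_CORE, obj)) != NULL)
        printf("core L#%u is inside the set\n", obj->logical_index);
}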
@@ -289,6 +312,7 @@ hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unu /** \brief Get the lowest object covering at least CPU set \p set * + * \return the lowest object covering the set entirely. * \return \c NULL if no object matches or if \p set is empty. */ static __hwloc_inline hwloc_obj_t @@ -309,11 +333,14 @@ hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t s /** \brief Iterate through same-depth objects covering at least CPU set \p set * - * If object \p prev is \c NULL, return the first object at depth \p - * depth covering at least part of CPU set \p set. The next - * invokation should pass the previous return value in \p prev so as + * The next invocation should pass the previous return value in \p prev so as * to obtain the next object covering at least another part of \p set. + * + * \return the first object at depth \p depth covering at least part of CPU set \p set + * if object \p prev is \c NULL. + * \return the next one if \p prev is not \c NULL. + * \return \c NULL if there is no next object. + * * \note This function cannot work if objects at the given depth do * not have CPU sets (I/O or Misc objects). */ @@ -331,15 +358,16 @@ hwloc_get_next_obj_covering_cpuset_by_depth(hwloc_topology_t topology, hwloc_con /** \brief Iterate through same-type objects covering at least CPU set \p set * - * If object \p prev is \c NULL, return the first object of type \p - * type covering at least part of CPU set \p set. The next invokation - * should pass the previous return value in \p prev so as to obtain - * the next object of type \p type covering at least another part of - * \p set. + * The next invocation should pass the previous return value in \p prev so as to obtain + * the next object of type \p type covering at least another part of \p set. * - * If there are no or multiple depths for type \p type, \c NULL is returned. - * The caller may fallback to hwloc_get_next_obj_covering_cpuset_by_depth() - * for each depth. + * \return the first object of type \p type covering at least part of CPU set \p set + * if object \p prev is \c NULL. + * \return the next one if \p prev is not \c NULL. + * \return \c NULL if there is no next object. + * \return \c NULL if there is no depth for the given type. + * \return \c NULL if there are multiple depths for the given type, + * the caller should fall back to hwloc_get_next_obj_covering_cpuset_by_depth(). * * \note This function cannot work if objects of the given type do * not have CPU sets (I/O or Misc objects). @@ -368,6 +396,9 @@ hwloc_get_next_obj_covering_cpuset_by_type(hwloc_topology_t topology, hwloc_cons */ /** \brief Returns the ancestor object of \p obj at depth \p depth. + * + * \return the ancestor if any. + * \return \c NULL if no such ancestor exists. * * \note \p depth should not be the depth of PU or NUMA objects * since they are ancestors of no objects (except Misc or I/O). @@ -388,6 +419,12 @@ hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unu } /** \brief Returns the ancestor object of \p obj with type \p type. + * + * \return the ancestor if any. + * \return \c NULL if no such ancestor exists. + * + * \note if multiple matching ancestors exist (e.g. multiple levels of ::HWLOC_OBJ_GROUP) + * the lowest one is returned. * * \note \p type should not be ::HWLOC_OBJ_PU or ::HWLOC_OBJ_NUMANODE * since these objects are ancestors of no objects (except Misc or I/O).
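The ancestor lookups documented above follow the same NULL-on-miss convention. A minimal sketch (not part of the patch; show_package_of_first_pu is an illustrative name), assuming the topology exposes at least one PU:

#include <hwloc.h>
#include <stdio.h>

/* Illustrative helper: walk up from the first PU to its package.
 * hwloc_get_ancestor_obj_by_type() returns NULL when no ancestor of the
 * requested type exists, as documented above. */
static void show_package_of_first_pu(hwloc_topology_t topology)
{
    hwloc_obj_t pu = hwloc_get_obj_by_type(topology, HWLOC_OBJ_PU, 0);
    if (!pu)
        return; /* no PU in this topology */
    hwloc_obj_t pkg = hwloc_get_ancestor_obj_by_type(topology, HWLOC_OBJ_PACKAGE, pu);
    if (pkg)
        printf("PU L#%u is inside package L#%u\n", pu->logical_index, pkg->logical_index);
}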
@@ -405,7 +442,12 @@ hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unus return ancestor; } -/** \brief Returns the common parent object to objects \p obj1 and \p obj2 */ +/** \brief Returns the common parent object to objects \p obj1 and \p obj2. + * + * \return the common ancestor. + * + * \note This function cannot return \c NULL. + */ static __hwloc_inline hwloc_obj_t hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2) __hwloc_attribute_pure; static __hwloc_inline hwloc_obj_t @@ -430,6 +472,8 @@ hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unuse } /** \brief Returns true if \p obj is inside the subtree beginning with ancestor object \p subtree_root. + * + * \return 1 if the object is in the subtree, 0 otherwise. * * \note This function cannot work if \p obj and \p subtree_root objects do * not have CPU sets (I/O or Misc objects). @@ -448,9 +492,9 @@ hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwl * then among the memory children list, then among the I/O * children list, then among the Misc children list. * - * If \p prev is \c NULL, return the first child. - * - * Return \c NULL when there is no next child. + * \return the first child if \p prev is \c NULL. + * \return the next child if \p prev is not \c NULL. + * \return \c NULL when there is no next child. */ static __hwloc_inline hwloc_obj_t hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t parent, hwloc_obj_t prev) @@ -462,7 +506,7 @@ hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_ state = 3; else if (prev->type == HWLOC_OBJ_BRIDGE || prev->type == HWLOC_OBJ_PCI_DEVICE || prev->type == HWLOC_OBJ_OS_DEVICE) state = 2; - else if (prev->type == HWLOC_OBJ_NUMANODE) + else if (prev->type == HWLOC_OBJ_NUMANODE || prev->type == HWLOC_OBJ_MEMCACHE) state = 1; obj = prev->next_sibling; } else { @@ -578,17 +622,19 @@ hwloc_obj_type_is_icache(hwloc_obj_type_t type); * corresponding type such as ::HWLOC_OBJ_L1ICACHE, except that it may * also return a Unified cache when looking for an instruction cache. * - * If no cache level matches, ::HWLOC_TYPE_DEPTH_UNKNOWN is returned. + * \return the depth of the unique matching unified cache level + * if \p cachetype is ::HWLOC_OBJ_CACHE_UNIFIED. + * + * \return the depth of either a matching cache level or a unified cache level + * if \p cachetype is ::HWLOC_OBJ_CACHE_DATA or ::HWLOC_OBJ_CACHE_INSTRUCTION. - * - * If \p cachetype is ::HWLOC_OBJ_CACHE_UNIFIED, the depth of the - * unique matching unified cache level is returned. + * \return the depth of the matching level + * if \p cachetype is \c -1 but only one level matches. - * - * If \p cachetype is ::HWLOC_OBJ_CACHE_DATA or ::HWLOC_OBJ_CACHE_INSTRUCTION, - * either a matching cache, or a unified cache is returned. + * \return ::HWLOC_TYPE_DEPTH_MULTIPLE + * if \p cachetype is \c -1 but multiple levels match. - * - * If \p cachetype is \c -1, it is ignored and multiple levels may - * match. The function returns either the depth of a uniquely matching - * level or ::HWLOC_TYPE_DEPTH_MULTIPLE. + * \return ::HWLOC_TYPE_DEPTH_UNKNOWN if no cache level matches.
*/ static __hwloc_inline int hwloc_get_cache_type_depth (hwloc_topology_t topology, @@ -622,7 +668,7 @@ hwloc_get_cache_type_depth (hwloc_topology_t topology, /** \brief Get the first data (or unified) cache covering a cpuset \p set * - * \return \c NULL if no cache matches. + * \return a covering cache, or \c NULL if no cache matches. */ static __hwloc_inline hwloc_obj_t hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure; @@ -640,7 +686,8 @@ hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t /** \brief Get the first data (or unified) cache shared between an object and somebody else. * - * \return \c NULL if no cache matches or if an invalid object is given. + * \return a shared cache. + * \return \c NULL if no cache matches or if an invalid object is given (e.g. I/O object). */ static __hwloc_inline hwloc_obj_t hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj) __hwloc_attribute_pure; @@ -684,6 +731,8 @@ hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute * If \p which is larger than the number of PUs in a core that were originally set in \p cpuset, * no PU is kept for that core. * + * \return 0. + * * \note PUs that are not below a Core object are ignored * (for instance if the topology does not contain any Core object). * None of them is removed from \p cpuset. @@ -698,6 +747,8 @@ HWLOC_DECLSPEC int hwloc_bitmap_singlify_per_core(hwloc_topology_t topology, hwl * one may iterate over the bits of the resulting CPU set with * hwloc_bitmap_foreach_begin(), and find the corresponding PUs * with this function. + * + * \return the PU object, or \c NULL if none matches. */ static __hwloc_inline hwloc_obj_t hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure; @@ -719,6 +770,8 @@ hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) * one may iterate over the bits of the resulting nodeset with * hwloc_bitmap_foreach_begin(), and find the corresponding NUMA nodes * with this function. + * + * \return the NUMA node object, or \c NULL if none matches. */ static __hwloc_inline hwloc_obj_t hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure; @@ -756,6 +809,8 @@ HWLOC_DECLSPEC unsigned hwloc_get_closest_objs (hwloc_topology_t topology, hwloc * For instance, if type1 is PACKAGE, idx1 is 2, type2 is CORE and idx2 * is 3, return the fourth core object below the third package. * + * \return a matching object if any, \c NULL otherwise. + * * \note This function requires these objects to have a CPU set. */ static __hwloc_inline hwloc_obj_t @@ -789,6 +844,8 @@ hwloc_get_obj_below_by_type (hwloc_topology_t topology, * and idxv contains 0, 1 and 2, return the third core object below * the second package below the first NUMA node. * + * \return a matching object if any, \c NULL otherwise. + * * \note This function requires all these objects and the root object * to have a CPU set. */ @@ -885,6 +942,8 @@ enum hwloc_distrib_flags_e { * * \p flags should be 0 or an OR'ed set of ::hwloc_distrib_flags_e. * + * \return 0 on success, -1 on error. + * * \note This function requires the \p roots objects to have a CPU set. */ static __hwloc_inline int @@ -961,6 +1020,8 @@ hwloc_distrib(hwloc_topology_t topology, * * \return the complete CPU set of processors of the system. * + * \note This function cannot return \c NULL.
+ * * \note The returned cpuset is not newly allocated and should thus not be * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. * @@ -975,6 +1036,8 @@ hwloc_topology_get_complete_cpuset(hwloc_topology_t topology) __hwloc_attribute_ * provides topology information. This is equivalent to the cpuset of the * system object. * + * \note This function cannot return \c NULL. + * * \note The returned cpuset is not newly allocated and should thus not be * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. * @@ -987,6 +1050,8 @@ hwloc_topology_get_topology_cpuset(hwloc_topology_t topology) __hwloc_attribute_ * * \return the CPU set of allowed processors of the system. * + * \note This function cannot return \c NULL. + * * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was not set, * this is identical to hwloc_topology_get_topology_cpuset(), which means * all PUs are allowed. @@ -1006,6 +1071,8 @@ hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology) __hwloc_attribute_p * * \return the complete node set of memory of the system. * + * \note This function cannot return \c NULL. + * * \note The returned nodeset is not newly allocated and should thus not be * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. * @@ -1020,6 +1087,8 @@ hwloc_topology_get_complete_nodeset(hwloc_topology_t topology) __hwloc_attribute * provides topology information. This is equivalent to the nodeset of the * system object. * + * \note This function cannot return \c NULL. + * * \note The returned nodeset is not newly allocated and should thus not be * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy. * @@ -1032,6 +1101,8 @@ hwloc_topology_get_topology_nodeset(hwloc_topology_t topology) __hwloc_attribute * * \return the node set of allowed memory of the system. * + * \note This function cannot return \c NULL. + * * \note If the topology flag ::HWLOC_TOPOLOGY_FLAG_INCLUDE_DISALLOWED was not set, * this is identical to hwloc_topology_get_topology_nodeset(), which means * all NUMA nodes are allowed. @@ -1066,6 +1137,9 @@ hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology) __hwloc_attribute_ * * Hence the entire topology CPU set is converted into the set of all nodes * that have some local CPUs. + * + * \return 0 on success. + * \return -1 with errno set to \c ENOMEM on internal reallocation failure. */ static __hwloc_inline int hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset) @@ -1090,6 +1164,9 @@ hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, * * Hence the entire topology node set is converted into the set of all CPUs * that have some local NUMA nodes. + * + * \return 0 on success. + * \return -1 with errno set to \c ENOMEM on internal reallocation failure. */ static __hwloc_inline int hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset) @@ -1122,6 +1199,10 @@ hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwl * because it has non-NULL CPU and node sets * and because its locality is the same as \p ioobj. * + * \return a non-I/O object. + * + * \note This function cannot return \c NULL. + * * \note The resulting object is usually a normal object but it could also * be a memory object (e.g. NUMA node) in future platforms if I/O objects * ever get attached to memory instead of CPUs. 
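A minimal sketch (not part of the patch; count_local_nodes is an illustrative name) of the cpuset-to-nodeset conversion documented above, exercising its newly documented -1/ENOMEM failure path:

#include <hwloc.h>
#include <errno.h>
#include <stdio.h>

/* Illustrative helper: count the NUMA nodes that have CPUs in `cpuset`,
 * checking the documented -1/ENOMEM failure of the conversion. */
static int count_local_nodes(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset)
{
    int weight;
    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
    if (!nodeset)
        return -1; /* allocation failed */
    if (hwloc_cpuset_to_nodeset(topology, cpuset, nodeset) < 0) {
        fprintf(stderr, "cpuset->nodeset conversion failed (errno=%d)\n", errno);
        hwloc_bitmap_free(nodeset);
        return -1;
    }
    weight = hwloc_bitmap_weight(nodeset); /* NUMA nodes with local CPUs in the set */
    hwloc_bitmap_free(nodeset);
    return weight;
}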
@@ -1140,6 +1221,8 @@ hwloc_get_non_io_ancestor_obj(hwloc_topology_t topology __hwloc_attribute_unused /** \brief Get the next PCI device in the system. * * \return the first PCI device if \p prev is \c NULL. + * \return the next PCI device if \p prev is not \c NULL. + * \return \c NULL if there is no next PCI device. */ static __hwloc_inline hwloc_obj_t hwloc_get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev) @@ -1149,6 +1232,8 @@ hwloc_get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev) /** \brief Find the PCI device object matching the PCI bus id * given domain, bus device and function PCI bus id. + * + * \return a matching PCI device object if any, \c NULL otherwise. */ static __hwloc_inline hwloc_obj_t hwloc_get_pcidev_by_busid(hwloc_topology_t topology, @@ -1167,6 +1252,8 @@ hwloc_get_pcidev_by_busid(hwloc_topology_t topology, /** \brief Find the PCI device object matching the PCI bus id * given as a string xxxx:yy:zz.t or yy:zz.t. + * + * \return a matching PCI device object if any, \c NULL otherwise. */ static __hwloc_inline hwloc_obj_t hwloc_get_pcidev_by_busidstring(hwloc_topology_t topology, const char *busid) @@ -1186,6 +1273,8 @@ hwloc_get_pcidev_by_busidstring(hwloc_topology_t topology, const char *busid) /** \brief Get the next OS device in the system. * * \return the first OS device if \p prev is \c NULL. + * \return the next OS device if \p prev is not \c NULL. + * \return \c NULL if there is no next OS device. */ static __hwloc_inline hwloc_obj_t hwloc_get_next_osdev(hwloc_topology_t topology, hwloc_obj_t prev) @@ -1196,6 +1285,8 @@ hwloc_get_next_osdev(hwloc_topology_t topology, hwloc_obj_t prev) /** \brief Get the next bridge in the system. * * \return the first bridge if \p prev is \c NULL. + * \return the next bridge if \p prev is not \c NULL. + * \return \c NULL if there is no next bridge. */ static __hwloc_inline hwloc_obj_t hwloc_get_next_bridge(hwloc_topology_t topology, hwloc_obj_t prev) @@ -1204,6 +1295,8 @@ hwloc_get_next_bridge(hwloc_topology_t topology, hwloc_obj_t prev) } /* \brief Checks whether a given bridge covers a given PCI bus. + * + * \return 1 if it covers, 0 if not. */ static __hwloc_inline int hwloc_bridge_covers_pcibus(hwloc_obj_t bridge, diff --git a/deps/hwloc/include/hwloc/memattrs.h b/deps/hwloc/include/hwloc/memattrs.h index acf4da537..6d2cff9be 100644 --- a/deps/hwloc/include/hwloc/memattrs.h +++ b/deps/hwloc/include/hwloc/memattrs.h @@ -1,5 +1,5 @@ /* - * Copyright © 2019-2022 Inria. All rights reserved. + * Copyright © 2019-2023 Inria. All rights reserved. * See COPYING in top-level directory. */ @@ -178,6 +178,9 @@ enum hwloc_memattr_id_e { typedef unsigned hwloc_memattr_id_t; /** \brief Return the identifier of the memory attribute with the given name. + * + * \return 0 on success. + * \return -1 with errno set to \c EINVAL if no such attribute exists. */ HWLOC_DECLSPEC int hwloc_memattr_get_by_name(hwloc_topology_t topology, @@ -247,6 +250,8 @@ enum hwloc_local_numanode_flag_e { * or the number of nodes that would have been stored if there were * enough room. * + * \return 0 on success or -1 on error. + * * \note Some of these NUMA nodes may not have any memory attribute * values and hence not be reported as actual targets in other functions. * @@ -276,6 +281,10 @@ hwloc_get_local_numanode_objs(hwloc_topology_t topology, * * \p flags must be \c 0 for now. * + * \return 0 on success. + * \return -1 on error, for instance with errno set to \c EINVAL if flags + * are invalid or no such attribute exists. 
+ * + * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET + * when referring to accesses performed by CPU cores. + * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc, @@ -307,7 +316,10 @@ hwloc_memattr_get_value(hwloc_topology_t topology, * * \p flags must be \c 0 for now. * - * If there are no matching targets, \c -1 is returned with \p errno set to \c ENOENT; + * \return 0 on success. + * \return -1 with errno set to \c ENOENT if there are no matching targets. + * \return -1 with errno set to \c EINVAL if flags are invalid, + * or no such attribute exists. * * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET * when referring to accesses performed by CPU cores. @@ -323,10 +335,6 @@ hwloc_memattr_get_best_target(hwloc_topology_t topology, hwloc_obj_t *best_target, hwloc_uint64_t *value); /** \brief Return the best initiator for the given attribute and target NUMA node. - * - * If the attribute does not relate to a specific initiator - * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR), - * \c -1 is returned and \p errno is set to \c EINVAL. * * If \p value is non \c NULL, the corresponding value is returned there. * @@ -342,7 +350,10 @@ hwloc_memattr_get_best_target(hwloc_topology_t topology, * * \p flags must be \c 0 for now. * - * If there are no matching initiators, \c -1 is returned with \p errno set to \c ENOENT; + * \return 0 on success. + * \return -1 with errno set to \c ENOENT if there are no matching initiators. + * \return -1 with errno set to \c EINVAL if the attribute does not relate to a specific initiator + * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR). */ HWLOC_DECLSPEC int hwloc_memattr_get_best_initiator(hwloc_topology_t topology, @@ -359,6 +370,9 @@ hwloc_memattr_get_best_initiator(hwloc_topology_t topology, */ /** \brief Return the name of a memory attribute. + * + * \return 0 on success. + * \return -1 with errno set to \c EINVAL if the attribute does not exist. */ HWLOC_DECLSPEC int hwloc_memattr_get_name(hwloc_topology_t topology, @@ -368,6 +382,9 @@ hwloc_memattr_get_name(hwloc_topology_t topology, /** \brief Return the flags of the given attribute. * * Flags are an OR'ed set of ::hwloc_memattr_flag_e. + * + * \return 0 on success. + * \return -1 with errno set to \c EINVAL if the attribute does not exist. */ HWLOC_DECLSPEC int hwloc_memattr_get_flags(hwloc_topology_t topology, @@ -397,6 +414,9 @@ enum hwloc_memattr_flag_e { * Add a specific memory attribute that is not defined in ::hwloc_memattr_id_e. * Flags are an OR'ed set of ::hwloc_memattr_flag_e. It must contain at least * one of ::HWLOC_MEMATTR_FLAG_HIGHER_FIRST or ::HWLOC_MEMATTR_FLAG_LOWER_FIRST. + * + * \return 0 on success. + * \return -1 with errno set to \c EBUSY if another attribute already uses this name. */ HWLOC_DECLSPEC int hwloc_memattr_register(hwloc_topology_t topology, @@ -421,6 +441,8 @@ hwloc_memattr_register(hwloc_topology_t topology, * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc, * but users may for instance use it to provide custom information about * host memory accesses performed by GPUs. + * + * \return 0 on success or -1 on error. */ HWLOC_DECLSPEC int hwloc_memattr_set_value(hwloc_topology_t topology, @@ -460,6 +482,8 @@ hwloc_memattr_set_value(hwloc_topology_t topology, * NUMA nodes with hwloc_get_local_numanode_objs() and then look at their attribute * values. * + * \return 0 on success or -1 on error.
+ * * \note The initiator \p initiator should be of type ::HWLOC_LOCATION_TYPE_CPUSET * when referring to accesses performed by CPU cores. * ::HWLOC_LOCATION_TYPE_OBJECT is currently unused internally by hwloc, @@ -497,6 +521,8 @@ hwloc_memattr_get_targets(hwloc_topology_t topology, * (it does not have the flag ::HWLOC_MEMATTR_FLAG_NEED_INITIATOR), * no initiator is returned. * + * \return 0 on success or -1 on error. + * * \note This function is meant for tools and debugging (listing internal information) * rather than for application queries. Applications should rather select useful * NUMA nodes with hwloc_get_local_numanode_objs() and then look at their attribute diff --git a/deps/hwloc/lib/libhwloc.a b/deps/hwloc/lib/libhwloc.a index f2632a9e7..b2eb53c6e 100644 Binary files a/deps/hwloc/lib/libhwloc.a and b/deps/hwloc/lib/libhwloc.a differ diff --git a/deps/mpi/bin/hydra_bstrap_proxy b/deps/mpi/bin/hydra_bstrap_proxy index c7bb4525b..b99341f56 100755 Binary files a/deps/mpi/bin/hydra_bstrap_proxy and b/deps/mpi/bin/hydra_bstrap_proxy differ diff --git a/deps/mpi/bin/hydra_nameserver b/deps/mpi/bin/hydra_nameserver index 58bc0bba5..3eb0844b5 100755 Binary files a/deps/mpi/bin/hydra_nameserver and b/deps/mpi/bin/hydra_nameserver differ diff --git a/deps/mpi/bin/hydra_pmi_proxy b/deps/mpi/bin/hydra_pmi_proxy index 7e77bb4da..595fa9cf4 100755 Binary files a/deps/mpi/bin/hydra_pmi_proxy and b/deps/mpi/bin/hydra_pmi_proxy differ diff --git a/deps/mpi/bin/mpicc b/deps/mpi/bin/mpicc index c0af92fc0..56586a998 100755 --- a/deps/mpi/bin/mpicc +++ b/deps/mpi/bin/mpicc @@ -14,6 +14,7 @@ # default_compiler_name="gcc" +user_set_compiler=0 #------------------------------------------------------------------------------ # Print mini-help if started without parameters @@ -35,6 +36,7 @@ for arg in "$@" ; do case $arg in -cc=*) compiler_name=`echo A$arg | sed -e 's/A-cc=//g'` + user_set_compiler=1 ;; esac done @@ -47,24 +49,34 @@ if [ $# -eq 1 -a "$1" = "-v" ] ; then opt_args="-nolinkage" fi -if [ x"$opt_args" == x"" ]; then - case "${compiler_short_name}" in - icc|icx) "$dir"/mpiicc -cc=$compiler_name "$@" ;; - cc|*gcc*|clang*) "$dir"/mpigcc -cc=$compiler_name "$@" ;; - mpicc) "$dir"/mpigcc "$@" ;; - *) - echo "Error: unsupported compiler name '$compiler_name'." - echo "Check -cc= command line option and I_MPI_CC='$I_MPI_CC' and MPICH_CC='$MPICH_CC' variables."; - exit 1 ;; - esac +if [ $user_set_compiler -eq 0 ]; then + # default compiler + if [ x"$opt_args" == x"" ]; then + "$dir"/mpigcc -cc=$compiler_name "$@" + else + "$dir"/mpigcc -cc=$compiler_name "$@" $opt_args + fi else - case "${compiler_short_name}" in - icc|icx) "$dir"/mpiicc -cc=$compiler_name "$@" $opt_args ;; - cc|*gcc*|clang*) "$dir"/mpigcc -cc=$compiler_name "$@" $opt_args ;; - mpicc) "$dir"/mpigcc "$@" $opt_args ;; - *) - echo "Error: unsupported compiler name '$compiler_name'." - echo "Check -cc= command line option and I_MPI_CC='$I_MPI_CC' and MPICH_CC='$MPICH_CC' variables."; - exit 1 ;; - esac + # don't need to duplicate -cc since user already provided the option + if [ x"$opt_args" == x"" ]; then + case "${compiler_short_name}" in + icc|icx) "$dir"/mpiicx "$@" ;; + cc|*gcc*|clang*) "$dir"/mpigcc "$@" ;; + mpicc) "$dir"/mpigcc "$@" ;; + *) + echo "Error: unsupported compiler name '$compiler_name'." 
+ echo "Check -cc= command line option and I_MPI_CC='$I_MPI_CC' and MPICH_CC='$MPICH_CC' variables."; + exit 1 ;; + esac + else + case "${compiler_short_name}" in + icc|icx) "$dir"/mpiicx "$@" $opt_args ;; + cc|*gcc*|clang*) "$dir"/mpigcc "$@" $opt_args ;; + mpicc) "$dir"/mpigcc "$@" $opt_args ;; + *) + echo "Error: unsupported compiler name '$compiler_name'." + echo "Check -cc= command line option and I_MPI_CC='$I_MPI_CC' and MPICH_CC='$MPICH_CC' variables."; + exit 1 ;; + esac + fi fi diff --git a/deps/mpi/bin/mpicxx b/deps/mpi/bin/mpicxx index 08d8d6a76..971af30cb 100755 --- a/deps/mpi/bin/mpicxx +++ b/deps/mpi/bin/mpicxx @@ -14,6 +14,7 @@ # default_compiler_name="g++" +user_set_compiler=0 #------------------------------------------------------------------------------ # Print mini-help if started without parameters @@ -35,6 +36,7 @@ for arg in "$@" ; do case $arg in -cxx=*) compiler_name=`echo A$arg | sed -e 's/A-cxx=//g'` + user_set_compiler=1 ;; esac done @@ -47,24 +49,34 @@ if [ $# -eq 1 -a "$1" = "-v" ] ; then opt_args="-nolinkage" fi -if [ x"$opt_args" == x"" ]; then - case "${compiler_short_name}" in - icc|icpc|icpx|dpcpp) "$dir"/mpiicpc -cxx=$compiler_name "$@" ;; - *g++*) "$dir"/mpigxx -cxx=$compiler_name "$@" ;; - mpicxx) "$dir"/mpigxx "$@" ;; - *) - echo "Error: unsupported compiler name '$compiler_name'." - echo "Check -cxx= command line option and I_MPI_CXX='$I_MPI_CXX' and MPICH_CXX='$MPICH_CXX' variables."; - exit 1 ;; - esac +if [ $user_set_compiler -eq 0 ]; then + # default compiler + if [ x"$opt_args" == x"" ]; then + "$dir"/mpigxx -cxx=$compiler_name "$@" + else + "$dir"/mpigxx -cxx=$compiler_name "$@" $opt_args + fi else - case "${compiler_short_name}" in - icc|icpc|icpx|dpcpp) "$dir"/mpiicpc -cxx=$compiler_name "$@" $opt_args ;; - *g++*) "$dir"/mpigxx -cxx=$compiler_name "$@" $opt_args ;; - mpicxx) "$dir"/mpigxx "$@" $opt_args ;; - *) - echo "Error: unsupported compiler name '$compiler_name'." - echo "Check -cxx= command line option and I_MPI_CXX='$I_MPI_CXX' and MPICH_CXX='$MPICH_CXX' variables."; - exit 1 ;; - esac + # don't need to duplicate -cc since user already provided the option + if [ x"$opt_args" == x"" ]; then + case "${compiler_short_name}" in + icc|icpc|icpx|dpcpp) "$dir"/mpiicpx "$@" ;; + *g++*) "$dir"/mpigxx "$@" ;; + mpicxx) "$dir"/mpigxx "$@" ;; + *) + echo "Error: unsupported compiler name '$compiler_name'." + echo "Check -cxx= command line option and I_MPI_CXX='$I_MPI_CXX' and MPICH_CXX='$MPICH_CXX' variables."; + exit 1 ;; + esac + else + case "${compiler_short_name}" in + icc|icpc|icpx|dpcpp) "$dir"/mpiicpx "$@" $opt_args ;; + *g++*) "$dir"/mpigxx "$@" $opt_args ;; + mpicxx) "$dir"/mpigxx "$@" $opt_args ;; + *) + echo "Error: unsupported compiler name '$compiler_name'." + echo "Check -cxx= command line option and I_MPI_CXX='$I_MPI_CXX' and MPICH_CXX='$MPICH_CXX' variables."; + exit 1 ;; + esac + fi fi diff --git a/deps/mpi/bin/mpiexec b/deps/mpi/bin/mpiexec index ad9fbbe8d..607c67dba 100755 Binary files a/deps/mpi/bin/mpiexec and b/deps/mpi/bin/mpiexec differ diff --git a/deps/mpi/bin/mpiexec.hydra b/deps/mpi/bin/mpiexec.hydra index ad9fbbe8d..607c67dba 100755 Binary files a/deps/mpi/bin/mpiexec.hydra and b/deps/mpi/bin/mpiexec.hydra differ diff --git a/deps/mpi/bin/mpigcc b/deps/mpi/bin/mpigcc index ac09ef4d7..17290ae1b 100755 --- a/deps/mpi/bin/mpigcc +++ b/deps/mpi/bin/mpigcc @@ -94,10 +94,10 @@ # # Directory locations: Fixed for any MPI implementation. 
# Set from the directory arguments to configure (e.g., --prefix=/usr/local) -prefix=I_MPI_SUBSTITUTE_INSTALLDIR +prefix="" # The environment variable I_MPI_ROOT may be used to override installation folder path -if [ -n "$I_MPI_ROOT" ] ; then - prefix=$I_MPI_ROOT; +if [ -n "${I_MPI_ROOT}" ] ; then + prefix="${I_MPI_ROOT}"; fi exec_prefix=__EXEC_PREFIX_TO_BE_FILLED_AT_INSTALL_TIME__ @@ -105,6 +105,15 @@ sysconfdir=${prefix}/etc includedir=${prefix}/include libdir=${prefix}/lib +if [ ! -f "${prefix}/lib/mpi/debug/libmpi.so" ]; then + release_lib_dir="/release" + debug_lib_dir="/debug" +else + sysconfdir=${prefix}/opt/mpi/etc + release_lib_dir="" + debug_lib_dir="/mpi/debug" +fi +MPILIBDIR=${release_lib_dir} # The environment variable I_MPI_COMPILER_CONFIG_DIR may be used to override # folder where *.conf files are placed if [ -n "$I_MPI_COMPILER_CONFIG_DIR" ] ; then @@ -117,7 +126,7 @@ CC="gcc" MPICH_VERSION="3.4a2" CFLAGS="" CPPFLAGS="" -MPIVERSION="2021.10" +MPIVERSION="2021.11" MPILIBNAME="mpi" @@ -412,7 +421,7 @@ for arg in "$@" ; do addarg=no ;; -g) - MPILIBDIR="/release" + MPILIBDIR=${release_lib_dir} ;; -static_log) static_log=yes @@ -510,37 +519,25 @@ fi if [ -n "$mpilib_override" ] ; then case "$mpilib_override" in - opt ) - MPILIBDIR="/release" - ;; - opt_mt ) - MPILIBDIR="/release" - MPILIBDIR_MT="mt" - ;; - dbg ) - MPILIBDIR="/debug" - ;; - dbg_mt ) - MPILIBDIR="/debug" - MPILIBDIR_MT="mt" - ;; - * ) - echo "Warning: incorrect library version specified. Automatically selected library will be used." - ;; + opt ) + MPILIBDIR=${release_lib_dir} + ;; + opt_mt ) + MPILIBDIR=${release_lib_dir} + ;; + dbg ) + MPILIBDIR=${debug_lib_dir} + ;; + dbg_mt ) + MPILIBDIR=${debug_lib_dir} + ;; + * ) + echo "Warning: incorrect library version specified. Automatically selected library will be used." + ;; esac fi - # ----------------------------------------------------------------------- -case "$MPILIBDIR" in - release | /release | debug | /debug) - if [ ! -z "$MPILIBDIR_MT" ]; then - MPILIBDIR=${MPILIBDIR}_${MPILIBDIR_MT} - fi - ;; - "" ) - MPILIBDIR=/release - ;; -esac + if [ "$static_mpi" = yes ] ; then if [ "x$fortran_binding" = "x" ]; then mpilibs="${libdir}/libmpifort.a ${libdir}${MPILIBDIR}/lib${MPILIBNAME}.a" diff --git a/deps/mpi/bin/mpigxx b/deps/mpi/bin/mpigxx index 3708ae264..36504441f 100755 --- a/deps/mpi/bin/mpigxx +++ b/deps/mpi/bin/mpigxx @@ -93,10 +93,10 @@ # Set the default values of all variables. # # Directory locations: Fixed for any MPI implementation -prefix=I_MPI_SUBSTITUTE_INSTALLDIR +prefix="" # The environment variable I_MPI_ROOT may be used to override installation folder path -if [ -n "$I_MPI_ROOT" ] ; then - prefix=$I_MPI_ROOT; +if [ -n "${I_MPI_ROOT}" ] ; then + prefix="${I_MPI_ROOT}"; fi exec_prefix=__EXEC_PREFIX_TO_BE_FILLED_AT_INSTALL_TIME__ @@ -104,6 +104,15 @@ sysconfdir=${prefix}/etc includedir=${prefix}/include libdir=${prefix}/lib +if [ ! 
-f "${prefix}/lib/mpi/debug/libmpi.so" ]; then + release_lib_dir="/release" + debug_lib_dir="/debug" +else + sysconfdir=${prefix}/opt/mpi/etc + release_lib_dir="" + debug_lib_dir="/mpi/debug" +fi +MPILIBDIR=${release_lib_dir} # The environment variable I_MPI_COMPILER_CONFIG_DIR may be used to override # folder where *.conf files are placed if [ -n "$I_MPI_COMPILER_CONFIG_DIR" ] ; then @@ -114,7 +123,7 @@ fi CXX="g++" MPICH_VERSION="3.4a2" CXXFLAGS="" -MPIVERSION="2021.10" +MPIVERSION="2021.11" MPILIBNAME="mpi" MPICXXLIBNAME="mpicxx" @@ -414,7 +423,7 @@ for arg in "$@" ; do addarg=no ;; -g) - MPILIBDIR="/release" + MPILIBDIR=${release_lib_dir} ;; -static_log) static_log=yes @@ -512,37 +521,23 @@ fi if [ -n "$mpilib_override" ] ; then case "$mpilib_override" in - opt ) - MPILIBDIR="/release" - ;; - opt_mt ) - MPILIBDIR="/release" - MPILIBDIR_MT="mt" - ;; - dbg ) - MPILIBDIR="/debug" - ;; - dbg_mt ) - MPILIBDIR="/debug" - MPILIBDIR_MT="mt" - ;; - * ) - echo "Warning: incorrect library version specified. Automatically selected library will be used." - ;; + opt ) + MPILIBDIR=${release_lib_dir} + ;; + opt_mt ) + MPILIBDIR=${release_lib_dir} + ;; + dbg ) + MPILIBDIR=${debug_lib_dir} + ;; + dbg_mt ) + MPILIBDIR=${debug_lib_dir} + ;; + * ) + echo "Warning: incorrect library version specified. Automatically selected library will be used." + ;; esac fi - -# ----------------------------------------------------------------------- -case "$MPILIBDIR" in - release | /release | debug | /debug) - if [ ! -z "$MPILIBDIR_MT" ]; then - MPILIBDIR=${MPILIBDIR}_${MPILIBDIR_MT} - fi - ;; - "" ) - MPILIBDIR=/release - ;; -esac # ----------------------------------------------------------------------- if [ "$static_mpi" = yes ] ; then if [ "x$fortran_binding" = "x" ]; then diff --git a/deps/mpi/bin/mpiicc b/deps/mpi/bin/mpiicc index 40e9da71a..b9d18745e 100755 --- a/deps/mpi/bin/mpiicc +++ b/deps/mpi/bin/mpiicc @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # Copyright Intel Corporation. # @@ -11,549 +11,32 @@ # This software and the related documents are provided as is, with no express # or implied warranties, other than those that are expressly stated in the # License. -#. -# (C) 2006 by Argonne National Laboratory. -# -# COPYRIGHT -# -# The following is a notice of limited availability of the code, and disclaimer -# which must be included in the prologue of the code and in all source listings -# of the code. -# -# Copyright Notice -# 1998--2020, Argonne National Laboratory -# -# Permission is hereby granted to use, reproduce, prepare derivative works, and -# to redistribute to others. This software was authored by: -# -# Mathematics and Computer Science Division -# Argonne National Laboratory, Argonne IL 60439 -# -# (and) -# -# Department of Computer Science -# University of Illinois at Urbana-Champaign -# -# -# GOVERNMENT LICENSE -# -# Portions of this material resulted from work developed under a U.S. -# Government Contract and are subject to the following license: the Government -# is granted for itself and others acting on its behalf a paid-up, nonexclusive, -# irrevocable worldwide license in this computer software to reproduce, prepare -# derivative works, and perform publicly and display publicly. -# -# DISCLAIMER -# -# This computer code material was prepared, in part, as an account of work -# sponsored by an agency of the United States Government. 
Neither the United -# States, nor the University of Chicago, nor any of their employees, makes any -# warranty express or implied, or assumes any legal liability or responsibility -# for the accuracy, completeness, or usefulness of any information, apparatus, -# product, or process disclosed, or represents that its use would not infringe -# privately owned rights. -# -# EXTERNAL CONTRIBUTIONS -# -# Portions of this code have been contributed under the above license by: -# -# * Intel Corporation -# * Cray -# * IBM Corporation -# * Microsoft Corporation -# * Mellanox Technologies Ltd. -# * DataDirect Networks. -# * Oak Ridge National Laboratory -# * Sun Microsystems, Lustre group -# * Dolphin Interconnect Solutions Inc. -# * Institut Polytechnique de Bordeaux -# -# . -# -# mpicc -# Simple script to compile and/or link MPI programs. -# This script knows the default flags and libraries, and can handle -# alternative C compilers and the associated flags and libraries. -# The important terms are: -# includedir, libdir - Directories containing an *installed* mpich2 -# prefix, execprefix - Often used to define includedir and libdir -# CC - C compiler -# WRAPPER_CFLAGS - Any special flags needed to compile -# WRAPPER_LDFLAGS - Any special flags needed to link -# MPILIBNAME - Name of the MPI library -# MPI_OTHERLIBS - Other libraries needed in order to link -# -# We assume that (a) the C compiler can both compile and link programs # -# Handling of command-line options: -# This is a little tricky because some options may contain blanks. -# -# Special issues with shared libraries - todo -# -# -------------------------------------------------------------------------- -# Set the default values of all variables. -# -# Directory locations: Fixed for any MPI implementation. -# Set from the directory arguments to configure (e.g., --prefix=/usr/local) -prefix=I_MPI_SUBSTITUTE_INSTALLDIR -# The environment variable I_MPI_ROOT may be used to override installation folder path -if [ -n "${I_MPI_ROOT}" ] ; then - prefix="${I_MPI_ROOT}"; -fi +COMPILER="icc" +TARGET_WRAPPER="mpiicx" +user_set_compiler=0 -PLATFORM="" -sysconfdir=${prefix}/etc -# The environment variable I_MPI_COMPILER_CONFIG_DIR may be used to override -# folder where *.conf files are placed -if [ -n "$I_MPI_COMPILER_CONFIG_DIR" ] ; then - sysconfdir=$I_MPI_COMPILER_CONFIG_DIR; -fi -includedir=${prefix}/include -libdir=${prefix}/lib -# -# Default settings for compiler, flags, and libraries. -# Determined by a combination of environment variables and tests within -# configure (e.g., determining whehter -lsocket is needee) -CC="icc" -CFLAGS="" -LDFLAGS="-ldl" -MPILIBNAME="mpi" - -# MPIVERSION is the version of the MPICH2 library that mpicc is intended for -MPIVERSION="2021.10" -# -# Internal variables -# Show is set to echo to cause the compilation command to be echoed instead -# of executed. -Show= -static_mpi=no -strip_debug_info= -handle_executable= -executable=a.out -ilp64=no -no_rpath=no -# -# End of initialization of variables -#--------------------------------------------------------------------- -# Environment Variables. -# The environment variables I_MPI_CC, MPICH_CC may be used to override the -# default choices. I_MPI_CC has higher priority than MPICH_CC. -# In addition, if there is a file $sysconfdir/mpicc-$CCname.conf, -# where CCname is the name of the compiler with all spaces replaced by hyphens -# (e.g., "cc -64" becomes "cc--64", that file is sources, allowing other -# changes to the compilation environment. 
See the variables used by the -# script (defined above) -if [ -n "$I_MPI_CC" ] ; then - CC="$I_MPI_CC" - CCname=$(echo "$CC" | sed 's/ /-/g') - if [ -s $sysconfdir/mpicc-$(basename $CCname).conf ] ; then - . $sysconfdir/mpicc-$(basename $CCname).conf - fi -else - if [ -n "$MPICH_CC" ] ; then - CC="$MPICH_CC" - CCname=$(echo $CC | sed 's/ /-/g') - if [ -s $sysconfdir/mpicc-$(basename $CCname).conf ] ; then - . $sysconfdir/mpicc-$(basename $CCname).conf - fi - fi -fi -if [ -n "$I_MPI_DEBUG_INFO_STRIP" ] ; then - for comp_val in "0" "off" "no" "disable" - do - if [ "$I_MPI_DEBUG_INFO_STRIP" = "$comp_val" ] ; then - strip_debug_info=no - break - fi - done -fi -# Allow a profiling option to be selected through an environment variable -if [ -n "$MPICC_PROFILE" ] ; then - profConf=$MPICC_PROFILE -fi -if [ -n "$I_MPI_CC_PROFILE" ] ; then - profConf=$I_MPI_CC_PROFILE -fi - -# Override default mpi library -if [ -n "$I_MPI_LINK" ] ; then - mpilib_override=$I_MPI_LINK +if [ -z "$1" ] ; then + ${TARGET_WRAPPER} -help + exit 0 fi -# -# ------------------------------------------------------------------------ -# Argument processing. -# This is somewhat awkward because of the handling of arguments within -# the shell. We want to handle arguments that include spaces without -# loosing the spacing (an alternative would be to use a more powerful -# scripting language that would allow us to retain the array of values, -# which the basic (rather than enhanced) Bourne shell does not. -# -# Look through the arguments for arguments that indicate compile only. -# If these are *not* found, add the library options - -linking=yes -allargs="" for arg in "$@" ; do - # Set addarg to no if this arg should be ignored by the C compiler - addarg=yes - qarg=$arg - if [ "x$handle_executable" = "xyes" ] ; then - executable=$arg - handle_executable= - fi - case "$arg" in - # ---------------------------------------------------------------- - # Compiler options that affect whether we are linking or no - -c|-S|-E|-M|-MM) - # The compiler links by default - linking=no - ;; - -o ) - handle_executable=yes - addarg=yes - ;; - # ---------------------------------------------------------------- - # Options that control how we use mpicc (e.g., -show, - # -cc=* -config=* - -echo) - addarg=no - set -x - ;; - -cc=*) - CC=$(echo A$arg | sed -e 's/A-cc=//g') - addarg=no - ;; - -show) - addarg=no - Show=echo - ;; - -show_env) - show_env=yes - ;; - -config=*) - addarg=no - CCname=$(echo A$arg | sed -e 's/A-config=//g') - if [ -s "$sysconfdir/mpicc-$CCname.conf" ] ; then - . "$sysconfdir/mpicc-$CCname.conf" - else - echo "Configuration file mpicc-$CCname.conf not found" - fi - ;; - -compile-info|-compile_info) - # -compile_info included for backward compatibility - Show=echo - addarg=no - ;; - -link-info|-link_info) - # -link_info included for backward compatibility - Show=echo - addarg=no - ;; - -v) - # Pass this argument to the compiler as well. - echo "$(basename $0) for the Intel(R) MPI Library $MPIVERSION for Linux*" - echo "Copyright Intel Corporation." - # if there is only 1 argument, it must be -v. - if [ "$#" -eq "1" ] ; then - linking=no - fi - ;; - -V) - # Pass this argument to the compiler to query the compiler version. - if [ "$#" -eq "1" ] ; then - linking=no - fi - ;; - -profile=*) - # Pass the name of a profiling configuration. 
As - # a special case, lib.so or lib.la may be used - # if the library is in $libdir - # Loading the profConf file is handled below - profConf=$(echo A$arg | sed -e 's/A-profile=//g') - addarg=no - ;; - -help) - # Print mini-help if started without parameters - echo "Simple script to compile and/or link MPI programs." - echo "Usage: $(basename $0) [options] " - echo "----------------------------------------------------------------------------" - echo "The following options are supported:" - echo " -cc= specify a C compiler name: i.e. -cc=icc" - echo " -echo print the scripts during their execution" - echo " -show show command lines without real calling" - echo " -show_env show environment variables" - echo " -config= specify a configuration file: i.e. -config=icc for mpicc-icc.conf file" - echo " -v print version info of $(basename $0) and its native compiler" - echo " -profile= specify a profile configuration file (an MPI profiling" - echo " library): i.e. -profile=myprofile for the myprofile.cfg file." - echo " As a special case, lib.so or lib.a may be used" - echo " if the library is found" - echo " -check_mpi link against the Intel(R) Trace Collector (-profile=vtmc)." - echo " -static_mpi link the Intel(R) MPI Library statically" - echo " -mt_mpi link the thread safe version of the Intel(R) MPI Library" - echo " -ilp64 link the ILP64 support of the Intel(R) MPI Library" - echo " -t or -trace" - echo " link against the Intel(R) Trace Collector" - echo " -trace-imbalance" - echo " link against the Intel(R) Trace Collector imbalance library" - echo " (-profile=vtim)" - echo " -dynamic_log link against the Intel(R) Trace Collector dynamically" - echo " -static use static linkage method" - echo " -nostrip turn off the debug information stripping during static linking" - echo " -fast the same as -static_mpi + pass -fast option to a compiler" - echo " -O enable optimization" - echo " -link_mpi=" - echo " link against the specified version of the Intel(R) MPI Library" - echo " i.e -link_mpi=opt|opt_mt|dbg|dbg_mt" - echo " -norpath disable rpath for compiler wrapper of the Intel(R) MPI Library" - echo "All other options will be passed to the compiler without changing." - echo "----------------------------------------------------------------------------" - echo "The following environment variables are used:" - echo " I_MPI_ROOT the Intel(R) MPI Library installation directory path" - echo " I_MPI_CC or MPICH_CC" - echo " the path/name of the underlying compiler to be used" - echo " I_MPI_CC_PROFILE or MPICC_PROFILE" - echo " the name of profile file (without extension)" - echo " I_MPI_COMPILER_CONFIG_DIR" - echo " the folder which contains configuration files *.conf" - echo " I_MPI_TRACE_PROFILE" - echo " specify a default profile for the -trace option" - echo " I_MPI_CHECK_PROFILE" - echo " specify a default profile for the -check_mpi option" - echo " I_MPI_LINK specify the version of the Intel(R) MPI Library" - echo " I_MPI_DEBUG_INFO_STRIP" - echo " turn on/off the debug information stripping during static linking" - echo "----------------------------------------------------------------------------" - exit 0 - ;; - -nolinkage) - # This internal option is used by wrapper driver scripts mpicc, mpicxx, mpifc when -v option is used. 
- linking=no - addarg=no - ;; - -g) - MPILIBDIR="/release" - ;; - -static_mpi) - static_mpi=yes - CFLAGS="$CFLAGS -Xlinker --export-dynamic" - addarg=no - ;; - -static) - static_mpi=yes - CFLAGS="$CFLAGS -Xlinker --export-dynamic" - addarg=yes - ;; - -mt_mpi) - addarg=no - ;; - -ilp64) - ilp64=yes - addarg=no - ;; - -check_mpi) - if [ -z "$profConf" ]; then - if [ -z "$I_MPI_CHECK_PROFILE" ]; then - profConf="vtmc" - else - profConf="$I_MPI_CHECK_PROFILE" - fi - else - echo "Warning: the -check_mpi option will be ignored because the profile was set." - fi - addarg=no - ;; - -trace-imbalance) - if [ -z "$profConf" ]; then - profConf="vtim" - else - echo "Warning: the -trace-imbalance option will be ignored because the profile was set." - fi - addarg=no - ;; - -t | -trace | -t=* | -trace=* ) - if [ -z "$profConf" ]; then - if [ -z "$I_MPI_TRACE_PROFILE" ]; then - profConf="vt" - else - profConf="$I_MPI_TRACE_PROFILE" - fi - else - echo "Warning: the -trace option will be ignored because the profile was set." - fi - # Disable strip to prevent debug symbols into separate dbg file in case of static linking IMPI-1493 - strip_debug_info=no - addarg=no - ;; - -fast) - echo "Warning: the -fast option forces static linkage method for the Intel(R) MPI Library." - static_mpi=yes - CFLAGS="$CFLAGS -Xlinker --export-dynamic" - ;; - -link_mpi=* ) - mpilib_override=`echo A$arg | sed -e 's/A-link_mpi=//g'` - addarg=no - ;; - -nostrip ) - strip_debug_info=no - addarg=no - ;; - -norpath ) - no_rpath=yes - addarg=no - ;; - # Other arguments. We are careful to handle arguments with - # quotes (we try to quote all arguments in case they include - # any spaces) - *\"*) - qarg="'$arg'" - ;; - *\'*) - qarg=$(echo \"$arg\") - ;; - *) - qarg="'$arg'" + case $arg in + -cc=*) + COMPILER=$(echo $arg | sed -e 's/-cc=//g') + user_set_compiler=1 ;; esac - if [ $addarg = yes ] ; then - allargs="$allargs $qarg" - fi done -if [ $# -eq 0 ] ; then - echo "Error: Command line argument is needed!" - "$0" -help - exit 1 -fi - -if [ -n "$mpilib_override" ] ; then - case "$mpilib_override" in - opt ) - MPILIBDIR="/release" - ;; - opt_mt ) - MPILIBDIR="/release" - MPILIBDIR_MT="mt" - ;; - dbg ) - MPILIBDIR="/debug" - ;; - dbg_mt ) - MPILIBDIR="/debug" - MPILIBDIR_MT="mt" - ;; - * ) - echo "Warning: incorrect library version specified. Automatically selected library will be used." - ;; - esac -fi - -# ----------------------------------------------------------------------- -case "$MPILIBDIR" in - release | /release | debug | /debug) - if [ -n "$MPILIBDIR_MT" ]; then - MPILIBDIR=${MPILIBDIR}_${MPILIBDIR_MT} - fi - ;; - "" ) - MPILIBDIR="/release" - ;; -esac -if [ "$static_mpi" = yes ] ; then - mpilibs="${libdir}/libmpifort.a ${libdir}${MPILIBDIR}/lib${MPILIBNAME}.a" - I_MPI_OTHERLIBS="" - MPI_OTHERLIBS=" -lrt -lpthread " - if [ "$ilp64" = yes ]; then - mpilibs="$libdir/libmpi_ilp64.a $mpilibs" - fi - if [ "x$strip_debug_info" = "x" ] ; then - strip_debug_info=yes - fi -else - mpilibs="-lmpifort -l$MPILIBNAME" - I_MPI_OTHERLIBS="" - MPI_OTHERLIBS=" -lrt -lpthread " - if [ "$ilp64" = yes ]; then - mpilibs="-lmpi_ilp64 $mpilibs" - fi +if [ -n "$I_MPI_CC" ] || [ -n "$MPICH_CC" ]; then + user_set_compiler=1 fi -# Derived variables. 
These are assembled from variables set from the -# default, environment, configuration file (if any) and command-line -# options (if any) -# -# Handle the case of a profile switch -if [ -n "$profConf" ] ; then - profConffile= - if [ -s "$libdir/lib$profConf.a" ] || [ -s "$libdir/lib$profConf.so" ] ; then - mpilibs="-l$profConf $mpilibs" - elif [ -s "$sysconfdir/$profConf.conf" ] ; then - profConffile="$sysconfdir/$profConf.conf" - elif [ -s "$profConf.conf" ] ; then - profConffile="$profConf.conf" - else - echo "Profiling configuration file $profConf.conf not found in $sysconfdir" - fi - if [ -n "$profConffile" ] && [ -s "$profConffile" ] ; then - . $profConffile - if [ -n "$PROFILE_INCPATHS" ] ; then - CFLAGS="$PROFILE_INCPATHS $CFLAGS" - fi - if [ -n "$PROFILE_PRELIB" ] ; then - mpilibs="$PROFILE_PRELIB $mpilibs" - fi - if [ -n "$PROFILE_POSTLIB" ] ; then - mpilibs="$mpilibs $PROFILE_POSTLIB" - fi - fi -fi - -# ----------------------------------------------------------------------- -# -# A temporary statement to invoke the compiler -# Place the -L before any args incase there are any mpi libraries in there. -# Eventually, we'll want to move this after any non-MPI implementation -# libraries. -# We use a single invocation of the compiler. This will be adequate until -# we run into a system that uses a separate linking command. With any luck, -# such archaic systems are no longer with us. This also lets us -# accept any argument; we don't need to know if we've seen a source -# file or an object file. Instead, we just check for an option that -# suppressing linking, such as -c or -M. - -if [ "${show_env}" = "yes" ]; then - env | more - exit 0 -fi - -if [ "$no_rpath" = "yes" ]; then - rpath_opt="-Xlinker --enable-new-dtags" +if [ $user_set_compiler -eq 1 ] ; then + ${TARGET_WRAPPER} "$@" else - rpath_opt="-Xlinker --enable-new-dtags -Xlinker -rpath -Xlinker \"${libdir}${MPILIBDIR}\" -Xlinker -rpath -Xlinker \"${libdir}\"" + ${TARGET_WRAPPER} -cc="${COMPILER}" "$@" fi -if [ "$linking" = yes ] ; then - cmd_line="$CC $CFLAGS $allargs -I\"${includedir}\" -L\"${libdir}${MPILIBDIR}\" -L\"${libdir}\" $rpath_opt $mpilibs $I_MPI_OTHERLIBS $LDFLAGS $MPI_OTHERLIBS" - if [ "$Show" = echo ] ; then - echo $cmd_line - else - eval $(echo $cmd_line) - fi - rc=$? - if [ $rc -eq 0 ] && [ "x$strip_debug_info" = "xyes" ] ; then - $Show objcopy --only-keep-debug ${executable} ${executable}.dbg - $Show objcopy --strip-debug ${executable} - $Show objcopy --add-gnu-debuglink=${executable}.dbg ${executable} - fi -else - cmd_line="$CC $CFLAGS $allargs -I\"${includedir}\"" - if [ "$Show" = echo ] ; then - echo "$cmd_line" - else - eval $(echo $cmd_line) - fi - rc=$? -fi - -exit $rc diff --git a/deps/mpi/bin/mpiicpc b/deps/mpi/bin/mpiicpc index aeae2b888..7942db898 100755 --- a/deps/mpi/bin/mpiicpc +++ b/deps/mpi/bin/mpiicpc @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # # Copyright Intel Corporation. # @@ -11,554 +11,32 @@ # This software and the related documents are provided as is, with no express # or implied warranties, other than those that are expressly stated in the # License. -#. -# (C) 2006 by Argonne National Laboratory. -# -# COPYRIGHT -# -# The following is a notice of limited availability of the code, and disclaimer -# which must be included in the prologue of the code and in all source listings -# of the code. -# -# Copyright Notice -# 1998--2020, Argonne National Laboratory -# -# Permission is hereby granted to use, reproduce, prepare derivative works, and -# to redistribute to others. 
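
Taken together, the mpiicc changes above reduce that script to a dispatcher: it records whether the caller picked a compiler (via -cc= or the I_MPI_CC/MPICH_CC variables) and delegates everything to mpiicx, injecting -cc=icc only when no explicit choice was made. A minimal sketch of the resulting behavior, illustrative rather than captured output, assuming mpiicx is on PATH:

    $ mpiicc hello.c -o hello           # no compiler chosen -> runs: mpiicx -cc=icc hello.c -o hello
    $ mpiicc -cc=gcc hello.c -o hello   # explicit -cc= -> runs: mpiicx -cc=gcc hello.c -o hello (arguments pass through)
    $ I_MPI_CC=clang mpiicc hello.c     # env override -> runs: mpiicx hello.c (mpiicx itself honors I_MPI_CC)
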
This software was authored by: -# -# Mathematics and Computer Science Division -# Argonne National Laboratory, Argonne IL 60439 -# -# (and) -# -# Department of Computer Science -# University of Illinois at Urbana-Champaign -# -# -# GOVERNMENT LICENSE -# -# Portions of this material resulted from work developed under a U.S. -# Government Contract and are subject to the following license: the Government -# is granted for itself and others acting on its behalf a paid-up, nonexclusive, -# irrevocable worldwide license in this computer software to reproduce, prepare -# derivative works, and perform publicly and display publicly. -# -# DISCLAIMER -# -# This computer code material was prepared, in part, as an account of work -# sponsored by an agency of the United States Government. Neither the United -# States, nor the University of Chicago, nor any of their employees, makes any -# warranty express or implied, or assumes any legal liability or responsibility -# for the accuracy, completeness, or usefulness of any information, apparatus, -# product, or process disclosed, or represents that its use would not infringe -# privately owned rights. -# -# EXTERNAL CONTRIBUTIONS -# -# Portions of this code have been contributed under the above license by: -# -# * Intel Corporation -# * Cray -# * IBM Corporation -# * Microsoft Corporation -# * Mellanox Technologies Ltd. -# * DataDirect Networks. -# * Oak Ridge National Laboratory -# * Sun Microsystems, Lustre group -# * Dolphin Interconnect Solutions Inc. -# * Institut Polytechnique de Bordeaux -# -# . -# -# mpicxx -# Simple script to compile and/or link MPI programs. -# This script knows the default flags and libraries, and can handle -# alternative C++ compilers and the associated flags and libraries. -# The important terms are: -# includedir, libdir - Directories containing an *installed* mpich2 -# prefix, execprefix - Often used to define includedir and libdir -# CXX - C compiler -# WRAPPER_CXXFLAGS - Any special flags needed to compile -# WRAPPER_LDFLAGS - Any special flags needed to link -# MPILIBNAME - Name of the MPI library -# MPICXXLIBNAME - Name of the C++ binding part of the MPI library -# MPI_OTHERLIBS - Other libraries needed in order to link -# -# We assume that (a) the C++ compiler can both compile and link programs # -# Handling of command-line options: -# This is a little tricky because some options may contain blanks. -# -# Special issues with shared libraries - todo -# -# -------------------------------------------------------------------------- - -# Set the default values of all variables. 
-# -# Directory locations: Fixed for any MPI implementation -prefix=I_MPI_SUBSTITUTE_INSTALLDIR -# The environment variable I_MPI_ROOT may be used to override installation folder path -if [ -n "${I_MPI_ROOT}" ] ; then - prefix="${I_MPI_ROOT}"; -fi -PLATFORM="" -sysconfdir=${prefix}/etc +COMPILER="icpc" +TARGET_WRAPPER="mpiicpx" +user_set_compiler=0 -# The environment variable I_MPI_COMPILER_CONFIG_DIR may be used to override -# folder where *.conf files are placed -if [ -n "$I_MPI_COMPILER_CONFIG_DIR" ] ; then - sysconfdir=$I_MPI_COMPILER_CONFIG_DIR; -fi - -includedir=${prefix}/include -libdir=${prefix}/lib - -# Default settings for compiler, flags, and libraries -CXX="icpc" -CXXFLAGS="" -LDFLAGS="-ldl" -MPILIBNAME="mpi" -MPICXXLIBNAME="mpicxx" - -# MPIVERSION is the version of the Intel(R) MPI Library that mpiicpc is intended for -MPIVERSION="2021.10" - -# Internal variables -# Show is set to echo to cause the compilation command to be echoed instead -# of executed. -Show= -static_mpi=no -strip_debug_info= -handle_executable= -executable=a.out -ilp64=no -no_rpath=no -# End of initialization of variables -# -#--------------------------------------------------------------------- -# Environment Variables. -# The environment variables I_MPI_CXX, MPICH_CXX may be used to override the -# default choices. I_MPI_CXX has higher priority than MPICH_CXX. -# In addition, if there is a file $sysconfdir/mpicxx-$CXXname.conf, -# where CXXname is the name of the compiler with all spaces replaced by hyphens -# (e.g., "CC -64" becomes "CC--64", that file is sources, allowing other -# changes to the compilation environment. See the variables used by the -# script (defined above) - -if [ -n "$I_MPI_CXX" ] ; then - CXX="$I_MPI_CXX" - CXXname=`echo $CXX | sed 's/ /-/g'` - if [ -s $sysconfdir/mpicxx-$(basename $CXXname).conf ] ; then - . $sysconfdir/mpicxx-$(basename $CXXname).conf - fi -else - if [ -n "$MPICH_CXX" ] ; then - CXX="$MPICH_CXX" - CXXname=`echo $CXX | sed 's/ /-/g'` - if [ -s $sysconfdir/mpicxx-$(basename $CXXname).conf ] ; then - . $sysconfdir/mpicxx-$(basename $CXXname).conf - fi - fi -fi -if [ -n "$I_MPI_DEBUG_INFO_STRIP" ] ; then - for comp_val in "0" "off" "no" "disable" - do - if [ "$I_MPI_DEBUG_INFO_STRIP" = "$comp_val" ] ; then - strip_debug_info=no - break - fi - done -fi -# Allow a profiling option to be selected through an environment variable -if [ -n "$MPICXX_PROFILE" ] ; then - profConf=$MPICXX_PROFILE -fi -if [ -n "$I_MPI_CXX_PROFILE" ] ; then - profConf=$I_MPI_CXX_PROFILE -fi - -# Override default mpi library -if [ -n "$I_MPI_LINK" ] ; then - mpilib_override=$I_MPI_LINK +if [ -z "$1" ] ; then + ${TARGET_WRAPPER} -help + exit 0 fi -# -# ------------------------------------------------------------------------ -# Argument processing. -# This is somewhat awkward because of the handling of arguments within -# the shell. We want to handle arguments that include spaces without -# loosing the spacing (an alternative would be to use a more powerful -# scripting language that would allow us to retain the array of values, -# which the basic (rather than enhanced) Bourne shell does not. -# -# Look through the arguments for arguments that indicate compile only. 
-# If these are *not* found, add the library options - -linking=yes -allargs="" for arg in "$@" ; do - # Set addarg to no if this arg should be ignored by the C compiler - addarg=yes - qarg=$arg - if [ "x$handle_executable" = "xyes" ] ; then - executable=$arg - handle_executable= - fi - case "$arg" in - # ---------------------------------------------------------------- - # Compiler options that affect whether we are linking or no - -c|-S|-E|-M|-MM) - # The compiler links by default - linking=no - ;; - -o ) - handle_executable=yes - addarg=yes - ;; - # ---------------------------------------------------------------- - # Options that control how we use mpicxx (e.g., -show, - # -cxx=* -config=* - -echo) - addarg=no - set -x - ;; - -cxx=*) - CXX=$(echo A$arg | sed -e 's/A-cxx=//g') - addarg=no - ;; - # Backwards compatibility for MPICH1 - scripts - -CC=*) - CXX=$(echo A$arg | sed -e 's/A-CC=//g') - addarg=no - ;; - -show) - addarg=no - Show=echo - ;; - -show_env) - show_env=yes - ;; - -config=*) - addarg=no - CXXname=$(echo A$arg | sed -e 's/A-config=//g') - if [ -s "$sysconfdir/mpicxx-$CXXname.conf" ] ; then - . "$sysconfdir/mpicxx-$CXXname.conf" - else - echo "Configuration file mpicxx-$CXXname.conf not found" - fi - ;; - -compile-info|-compile_info) - # -compile_info included for backward compatibility - Show=echo - addarg=no - ;; - -link-info|-link_info) - # -link_info included for backward compatibility - Show=echo - addarg=no - ;; - -v) - # Pass this argument to the compiler as well. - echo "$(basename $0) for the Intel(R) MPI Library $MPIVERSION for Linux*" - echo "Copyright Intel Corporation." - # if there is only 1 argument, it must be -v. - if [ "$#" -eq "1" ] ; then - linking=no - fi - ;; - -V) - # Pass this argument to the compiler to query the compiler version. - if [ "$#" -eq "1" ] ; then - linking=no - fi - ;; - -profile=*) - # Pass the name of a profiling configuration. As - # a special case, lib.so or lib.la may be used - # if the library is in $libdir - profConf=$(echo A$arg | sed -e 's/A-profile=//g') - addarg=no - # Loading the profConf file is handled below - ;; - -help) - # Print mini-help if started without parameters - echo "Simple script to compile and/or link MPI programs." - echo "Usage: $(basename $0) [options] " - echo "----------------------------------------------------------------------------" - echo "The following options are supported:" - echo " -cxx= specify a C++ compiler name: i.e. -cxx=icpc" - echo " -echo print the scripts during their execution" - echo " -show show command lines without real calling" - echo " -show_env show environment variables" - echo " -config= specify a configuration file: i.e. -config=icpc for mpicc-icpc.conf file" - echo " -v print version info of $(basename $0) and its native compiler" - echo " -profile= specify a profile configuration file (an MPI profiling" - echo " library): i.e. -profile=myprofile for the myprofile.cfg file." - echo " As a special case, lib.so or lib.a may be used" - echo " if the library is found" - echo " -check_mpi link against the Intel(R) Trace Collector (-profile=vtmc)." 
- echo " -static_mpi link the Intel(R) MPI Library statically" - echo " -mt_mpi link the thread safe version of the Intel(R) MPI Library" - echo " -ilp64 link the ILP64 support of the Intel(R) MPI Library" - echo " -fast the same as -static_mpi + pass -fast option to a compiler" - echo " -t or -trace" - echo " link against the Intel(R) Trace Collector" - echo " -trace-imbalance" - echo " link against the Intel(R) Trace Collector imbalance library" - echo " (-profile=vtim)" - echo " -static use static linkage method" - echo " -nostrip turn off the debug information stripping during static linking" - echo " -dynamic_log link against the Intel(R) Trace Collector dynamically" - echo " -O enable optimization" - echo " -link_mpi=" - echo " link against the specified version of the Intel(R) MPI Library" - echo " i.e -link_mpi=opt|opt_mt|dbg|dbg_mt" - echo " -norpath disable rpath for compiler wrapper of the Intel(R) MPI Library" - echo "All other options will be passed to the compiler without changing." - echo "----------------------------------------------------------------------------" - echo "The following environment variables are used:" - echo " I_MPI_ROOT the Intel(R) MPI Library installation directory path" - echo " I_MPI_CXX or MPICH_CXX" - echo " the path/name of the underlying compiler to be used" - echo " I_MPI_CXX_PROFILE or MPICXX_PROFILE" - echo " the name of profile file (without extension)" - echo " I_MPI_COMPILER_CONFIG_DIR" - echo " the folder which contains configuration files *.conf" - echo " I_MPI_TRACE_PROFILE" - echo " specify a default profile for the -trace option" - echo " I_MPI_CHECK_PROFILE" - echo " specify a default profile for the -check_mpi option" - echo " I_MPI_LINK specify the version of the Intel(R) MPI Library" - echo " I_MPI_DEBUG_INFO_STRIP" - echo " turn on/off the debug information stripping during static linking" - echo "----------------------------------------------------------------------------" - exit 0 - ;; - -nolinkage) - # This internal option is used by wrapper driver scripts mpicc, mpicxx, mpifc when -v option is used. - linking=no - addarg=no - ;; - -g) - MPILIBDIR="/release" - ;; - -static_mpi) - static_mpi=yes - CXXFLAGS="$CXXFLAGS -Xlinker --export-dynamic" - addarg=no - ;; - -static) - static_mpi=yes - CXXFLAGS="$CXXFLAGS -Xlinker --export-dynamic" - addarg=yes - ;; - -mt_mpi) - addarg=no - ;; - -ilp64) - ilp64=yes - addarg=no - ;; - -check_mpi) - if [ -z "$profConf" ]; then - if [ -z "$I_MPI_CHECK_PROFILE" ]; then - profConf="vtmc" - else - profConf="$I_MPI_CHECK_PROFILE" - fi - else - echo "Warning: the -check_mpi option will be ignored because the profile was set." - fi - addarg=no - ;; - -trace-imbalance) - if [ -z "$profConf" ]; then - profConf="vtim" - else - echo "Warning: the -trace-imbalance option will be ignored because the profile was set." - fi - addarg=no - ;; - -t | -trace | -t=* | -trace=* ) - if [ -z "$profConf" ]; then - if [ -z "$I_MPI_TRACE_PROFILE" ]; then - profConf="vt" - else - profConf="$I_MPI_TRACE_PROFILE" - fi - else - echo "Warning: the -trace option will be ignored because the profile was set." - fi - # Disable strip to prevent debug symbols into separate dbg file in case of static linking IMPI-1493 - strip_debug_info=no - addarg=no - ;; - -fast) - echo "Warning: the -fast option forces static linkage method for the Intel(R) MPI Library." 
- static_mpi=yes - CXXFLAGS="$CXXFLAGS -Xlinker --export-dynamic" - ;; - -link_mpi=* ) - mpilib_override=$(echo A$arg | sed -e 's/A-link_mpi=//g') - addarg=no - ;; - -nostrip ) - strip_debug_info=no - addarg=no - ;; - -norpath ) - no_rpath=yes - addarg=no - ;; - # Other arguments. We are careful to handle arguments with - # quotes (we try to quote all arguments in case they include - # any spaces) - *\"*) - qarg="'$arg'" - ;; - *\'*) - qarg=$(echo \"$arg\") - ;; - *) - qarg="'$arg'" + case $arg in + -cxx=*) + COMPILER=$(echo $arg | sed -e 's/-cxx=//g') + user_set_compiler=1 ;; esac - if [ $addarg = yes ] ; then - allargs="$allargs $qarg" - fi done -if [ $# -eq 0 ] ; then - echo "Error: Command line argument is needed!" - "$0" -help - exit 1 -fi - -if [ -n "$mpilib_override" ] ; then - case "$mpilib_override" in - opt ) - MPILIBDIR="/release" - ;; - opt_mt ) - MPILIBDIR="/release" - MPILIBDIR_MT="mt" - ;; - dbg ) - MPILIBDIR="/debug" - ;; - dbg_mt ) - MPILIBDIR="/debug" - MPILIBDIR_MT="mt" - ;; - * ) - echo "Warning: incorrect library version specified. Automatically selected library will be used." - ;; - esac -fi - -# ----------------------------------------------------------------------- -case "$MPILIBDIR" in - release | /release | debug | /debug) - if [ -n "$MPILIBDIR_MT" ]; then - MPILIBDIR=${MPILIBDIR}_${MPILIBDIR_MT} - fi - ;; - "" ) MPILIBDIR="/release" ;; -esac -if [ "$static_mpi" = yes ] ; then - mpilibs="${libdir}/libmpifort.a ${libdir}${MPILIBDIR}/lib${MPILIBNAME}.a" - I_MPI_OTHERLIBS="" - MPI_OTHERLIBS=" -lrt -lpthread " - if [ "$ilp64" = yes ]; then - mpilibs="$libdir/libmpi_ilp64.a $mpilibs" - fi - - CXX_BIND_LIB="$libdir/libmpicxx.a" - if [ "x$strip_debug_info" = "x" ] ; then - strip_debug_info=yes - fi -else - mpilibs="-lmpifort -l$MPILIBNAME" - I_MPI_OTHERLIBS="" - MPI_OTHERLIBS=" -lrt -lpthread " - if [ "$ilp64" = yes ]; then - mpilibs="-lmpi_ilp64 $mpilibs" - fi - - CXX_BIND_LIB="-lmpicxx" +if [ -n "$I_MPI_CXX" ] || [ -n "$MPICH_CXX" ]; then + user_set_compiler=1 fi -# Derived variables. These are assembled from variables set from the -# default, environment, configuration file (if any) and command-line -# options (if any) -cxxlibs= -if [ "$MPICXXLIBNAME" != "$MPILIBNAME" ] ; then - cxxlibs="$CXX_BIND_LIB" -fi -# -# Handle the case of a profile switch -if [ -n "$profConf" ] ; then - profConffile= - if [ -s "$libdir/lib$profConf.a" ] || [ -s "$libdir/lib$profConf.so" ] ; then - mpilibs="-l$profConf $mpilibs" - elif [ -s "$sysconfdir/$profConf.conf" ] ; then - profConffile="$sysconfdir/$profConf.conf" - elif [ -s "$profConf.conf" ] ; then - profConffile="$profConf.conf" - else - echo "Profiling configuration file $profConf.conf not found in $sysconfdir" - fi - if [ -n "$profConffile" ] && [ -s "$profConffile" ] ; then - . $profConffile - if [ -n "$PROFILE_INCPATHS" ] ; then - CXXFLAGS="$PROFILE_INCPATHS $CXXFLAGS" - fi - if [ -n "$PROFILE_PRELIB" ] ; then - mpilibs="$PROFILE_PRELIB $mpilibs" - fi - if [ -n "$PROFILE_POSTLIB" ] ; then - mpilibs="$mpilibs $PROFILE_POSTLIB" - fi - fi -fi -# A temporary statement to invoke the compiler -# Place the -L before any args incase there are any mpi libraries in there. 
-# Eventually, we'll want to move this after any non-MPI implementation -# libraries - -if [ "${show_env}" = "yes" ]; then - env | more - exit 0 -fi - -if [ "$no_rpath" = "yes" ]; then - rpath_opt="-Xlinker --enable-new-dtags" -else - rpath_opt="-Xlinker --enable-new-dtags -Xlinker -rpath -Xlinker \"${libdir}${MPILIBDIR}\" -Xlinker -rpath -Xlinker \"${libdir}\"" -fi -if [ "$linking" = yes ] ; then - cmd_line="$CXX $CXXFLAGS $allargs -I\"${includedir}\" -L\"${libdir}${MPILIBDIR}\" -L\"${libdir}\" $rpath_opt $cxxlibs $mpilibs $I_MPI_OTHERLIBS $LDFLAGS $MPI_OTHERLIBS" - if [ "$Show" = echo ] ; then - echo $cmd_line - else - eval `echo $cmd_line` - fi - rc=$? - if [ $rc -eq 0 ] && [ "x$strip_debug_info" = "xyes" ] ; then - $Show objcopy --only-keep-debug ${executable} ${executable}.dbg - $Show objcopy --strip-debug ${executable} - $Show objcopy --add-gnu-debuglink=${executable}.dbg ${executable} - fi +if [ $user_set_compiler -eq 1 ] ; then + ${TARGET_WRAPPER} "$@" else - cmd_line="$CXX $CXXFLAGS $allargs -I\"${includedir}\"" - if [ "$Show" = echo ] ; then - $Show $cmd_line - else - eval `echo $cmd_line` - fi - rc=$? + ${TARGET_WRAPPER} -cxx="${COMPILER}" "$@" fi - -exit $rc - diff --git a/deps/mpi/bin/mpiicpx b/deps/mpi/bin/mpiicpx new file mode 100755 index 000000000..3b8111902 --- /dev/null +++ b/deps/mpi/bin/mpiicpx @@ -0,0 +1,565 @@ +#!/bin/sh +# +# Copyright Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, and +# your use of them is governed by the express license under which they were +# provided to you (License). Unless the License provides otherwise, you may +# not use, modify, copy, publish, distribute, disclose or transmit this +# software or the related documents without Intel's prior written permission. +# +# This software and the related documents are provided as is, with no express +# or implied warranties, other than those that are expressly stated in the +# License. +#. +# (C) 2006 by Argonne National Laboratory. +# +# COPYRIGHT +# +# The following is a notice of limited availability of the code, and disclaimer +# which must be included in the prologue of the code and in all source listings +# of the code. +# +# Copyright Notice +# 1998--2020, Argonne National Laboratory +# +# Permission is hereby granted to use, reproduce, prepare derivative works, and +# to redistribute to others. This software was authored by: +# +# Mathematics and Computer Science Division +# Argonne National Laboratory, Argonne IL 60439 +# +# (and) +# +# Department of Computer Science +# University of Illinois at Urbana-Champaign +# +# +# GOVERNMENT LICENSE +# +# Portions of this material resulted from work developed under a U.S. +# Government Contract and are subject to the following license: the Government +# is granted for itself and others acting on its behalf a paid-up, nonexclusive, +# irrevocable worldwide license in this computer software to reproduce, prepare +# derivative works, and perform publicly and display publicly. +# +# DISCLAIMER +# +# This computer code material was prepared, in part, as an account of work +# sponsored by an agency of the United States Government. Neither the United +# States, nor the University of Chicago, nor any of their employees, makes any +# warranty express or implied, or assumes any legal liability or responsibility +# for the accuracy, completeness, or usefulness of any information, apparatus, +# product, or process disclosed, or represents that its use would not infringe +# privately owned rights. 
+# +# EXTERNAL CONTRIBUTIONS +# +# Portions of this code have been contributed under the above license by: +# +# * Intel Corporation +# * Cray +# * IBM Corporation +# * Microsoft Corporation +# * Mellanox Technologies Ltd. +# * DataDirect Networks. +# * Oak Ridge National Laboratory +# * Sun Microsystems, Lustre group +# * Dolphin Interconnect Solutions Inc. +# * Institut Polytechnique de Bordeaux +# +# . +# +# mpicxx +# Simple script to compile and/or link MPI programs. +# This script knows the default flags and libraries, and can handle +# alternative C++ compilers and the associated flags and libraries. +# The important terms are: +# includedir, libdir - Directories containing an *installed* mpich2 +# prefix, execprefix - Often used to define includedir and libdir +# CXX - C++ compiler +# WRAPPER_CXXFLAGS - Any special flags needed to compile +# WRAPPER_LDFLAGS - Any special flags needed to link +# MPILIBNAME - Name of the MPI library +# MPICXXLIBNAME - Name of the C++ binding part of the MPI library +# MPI_OTHERLIBS - Other libraries needed in order to link +# +# We assume that (a) the C++ compiler can both compile and link programs +# +# Handling of command-line options: +# This is a little tricky because some options may contain blanks. +# +# Special issues with shared libraries - todo +# +# -------------------------------------------------------------------------- + +# Set the default values of all variables. +# +# Directory locations: Fixed for any MPI implementation +prefix="" +# The environment variable I_MPI_ROOT may be used to override installation folder path +if [ -n "${I_MPI_ROOT}" ] ; then + prefix="${I_MPI_ROOT}"; +fi + +sysconfdir=${prefix}/etc +includedir=${prefix}/include +libdir=${prefix}/lib +if [ ! -f "${I_MPI_ROOT}/lib/mpi/debug/libmpi.so" ]; then + release_lib_dir="/release" + debug_lib_dir="/debug" +else + sysconfdir=${prefix}/opt/mpi/etc + release_lib_dir="" + debug_lib_dir="/mpi/debug" +fi +MPILIBDIR=${release_lib_dir} + +# The environment variable I_MPI_COMPILER_CONFIG_DIR may be used to override +# folder where *.conf files are placed +if [ -n "$I_MPI_COMPILER_CONFIG_DIR" ] ; then + sysconfdir=$I_MPI_COMPILER_CONFIG_DIR; +fi + +# Default settings for compiler, flags, and libraries +CXX="icpx" +CXXFLAGS="" +LDFLAGS="-ldl" +MPILIBNAME="mpi" +MPICXXLIBNAME="mpicxx" + +# MPIVERSION is the version of the Intel(R) MPI Library that mpiicpx is intended for +MPIVERSION="@IMPI_OFFICIALVERSION@" + +# Internal variables +# Show is set to echo to cause the compilation command to be echoed instead +# of executed. +Show= +static_mpi=no +strip_debug_info= +handle_executable= +executable=a.out +ilp64=no +no_rpath=no +# End of initialization of variables +# +#--------------------------------------------------------------------- +# Environment Variables. +# The environment variables I_MPI_CXX, MPICH_CXX may be used to override the +# default choices. I_MPI_CXX has higher priority than MPICH_CXX. +# In addition, if there is a file $sysconfdir/mpicxx-$CXXname.conf, +# where CXXname is the name of the compiler with all spaces replaced by hyphens +# (e.g., "CC -64" becomes "CC--64"), that file is sourced, allowing other +# changes to the compilation environment. See the variables used by the +# script (defined above) + +if [ -n "$I_MPI_CXX" ] ; then + CXX="$I_MPI_CXX" + CXXname=`echo $CXX | sed 's/ /-/g'` + if [ -s $sysconfdir/mpicxx-$(basename $CXXname).conf ] ; then + .
$sysconfdir/mpicxx-$(basename $CXXname).conf + fi +else + if [ -n "$MPICH_CXX" ] ; then + CXX="$MPICH_CXX" + CXXname=`echo $CXX | sed 's/ /-/g'` + if [ -s $sysconfdir/mpicxx-$(basename $CXXname).conf ] ; then + . $sysconfdir/mpicxx-$(basename $CXXname).conf + fi + fi +fi +if [ -n "$I_MPI_DEBUG_INFO_STRIP" ] ; then + for comp_val in "0" "off" "no" "disable" + do + if [ "$I_MPI_DEBUG_INFO_STRIP" = "$comp_val" ] ; then + strip_debug_info=no + break + fi + done +fi +# Allow a profiling option to be selected through an environment variable +if [ -n "$MPICXX_PROFILE" ] ; then + profConf=$MPICXX_PROFILE +fi +if [ -n "$I_MPI_CXX_PROFILE" ] ; then + profConf=$I_MPI_CXX_PROFILE +fi + +# Override default mpi library +if [ -n "$I_MPI_LINK" ] ; then + mpilib_override=$I_MPI_LINK +fi + +# +# ------------------------------------------------------------------------ +# Argument processing. +# This is somewhat awkward because of the handling of arguments within +# the shell. We want to handle arguments that include spaces without +# losing the spacing (an alternative would be to use a more powerful +# scripting language that would allow us to retain the array of values, +# which the basic (rather than enhanced) Bourne shell does not). +# +# Look through the arguments for arguments that indicate compile only. +# If these are *not* found, add the library options + +linking=yes +allargs="" +for arg in "$@" ; do + # Set addarg to no if this arg should be ignored by the C compiler + addarg=yes + qarg=$arg + if [ "x$handle_executable" = "xyes" ] ; then + executable=$arg + handle_executable= + fi + case "$arg" in + # ---------------------------------------------------------------- + # Compiler options that affect whether we are linking or not + -c|-S|-E|-M|-MM) + # The compiler links by default + linking=no + ;; + -o ) + handle_executable=yes + addarg=yes + ;; + # ---------------------------------------------------------------- + # Options that control how we use mpicxx (e.g., -show, + # -cxx=* -config=* + -echo) + addarg=no + set -x + ;; + -cxx=*) + CXX=$(echo A$arg | sed -e 's/A-cxx=//g') + if [ "$#" -eq "1" ] ; then + echo "Error: extra arguments required" + echo "usage: $(basename $0) -cxx= -v" + exit 1 + fi + addarg=no + ;; + # Backwards compatibility for MPICH1 - scripts + -CC=*) + CXX=$(echo A$arg | sed -e 's/A-CC=//g') + if [ "$#" -eq "1" ] ; then + echo "Error: extra arguments required" + echo "usage: $(basename $0) -CC= -v" + exit 1 + fi + addarg=no + ;; + -show) + addarg=no + Show=echo + ;; + -show_env) + show_env=yes + ;; + -config=*) + addarg=no + CXXname=$(echo A$arg | sed -e 's/A-config=//g') + if [ -s "$sysconfdir/mpicxx-$CXXname.conf" ] ; then + . "$sysconfdir/mpicxx-$CXXname.conf" + else + echo "Configuration file mpicxx-$CXXname.conf not found" + fi + ;; + -compile-info|-compile_info) + # -compile_info included for backward compatibility + Show=echo + addarg=no + ;; + -link-info|-link_info) + # -link_info included for backward compatibility + Show=echo + addarg=no + ;; + -v) + # Pass this argument to the compiler as well. + echo "$(basename $0) for the Intel(R) MPI Library $MPIVERSION for Linux*" + echo "Copyright Intel Corporation." + linking=no + ;; + -V) + # Pass this argument to the compiler to query the compiler version. + linking=no + ;; + -profile=*) + # Pass the name of a profiling configuration.
As + # a special case, lib.so or lib.la may be used + # if the library is in $libdir + profConf=$(echo A$arg | sed -e 's/A-profile=//g') + addarg=no + # Loading the profConf file is handled below + ;; + -help) + # Print mini-help if started without parameters + echo "Simple script to compile and/or link MPI programs." + echo "Usage: $(basename $0) [options] " + echo "----------------------------------------------------------------------------" + echo "The following options are supported:" + echo " -cxx= specify a C++ compiler name: i.e. -cxx=icpc" + echo " -echo print the scripts during their execution" + echo " -show show command lines without real calling" + echo " -show_env show environment variables" + echo " -config= specify a configuration file: i.e. -config=icpc for mpicxx-icpc.conf file" + echo " -v print version info of $(basename $0) and its native compiler" + echo " -profile= specify a profile configuration file (an MPI profiling" + echo " library): i.e. -profile=myprofile for the myprofile.cfg file." + echo " As a special case, lib.so or lib.a may be used" + echo " if the library is found" + echo " -check_mpi link against the Intel(R) Trace Collector (-profile=vtmc)." + echo " -static_mpi link the Intel(R) MPI Library statically" + echo " -mt_mpi link the thread safe version of the Intel(R) MPI Library" + echo " -ilp64 link the ILP64 support of the Intel(R) MPI Library" + echo " -fast the same as -static_mpi + pass -fast option to a compiler" + echo " -t or -trace" + echo " link against the Intel(R) Trace Collector" + echo " -trace-imbalance" + echo " link against the Intel(R) Trace Collector imbalance library" + echo " (-profile=vtim)" + echo " -static use static linkage method" + echo " -nostrip turn off the debug information stripping during static linking" + echo " -dynamic_log link against the Intel(R) Trace Collector dynamically" + echo " -O enable optimization" + echo " -link_mpi=" + echo " link against the specified version of the Intel(R) MPI Library" + echo " i.e. -link_mpi=opt|opt_mt|dbg|dbg_mt" + echo " -norpath disable rpath for compiler wrapper of the Intel(R) MPI Library" + echo "All other options will be passed to the compiler without changing." + echo "----------------------------------------------------------------------------" + echo "The following environment variables are used:" + echo " I_MPI_ROOT the Intel(R) MPI Library installation directory path" + echo " I_MPI_CXX or MPICH_CXX" + echo " the path/name of the underlying compiler to be used" + echo " I_MPI_CXX_PROFILE or MPICXX_PROFILE" + echo " the name of profile file (without extension)" + echo " I_MPI_COMPILER_CONFIG_DIR" + echo " the folder which contains configuration files *.conf" + echo " I_MPI_TRACE_PROFILE" + echo " specify a default profile for the -trace option" + echo " I_MPI_CHECK_PROFILE" + echo " specify a default profile for the -check_mpi option" + echo " I_MPI_LINK specify the version of the Intel(R) MPI Library" + echo " I_MPI_DEBUG_INFO_STRIP" + echo " turn on/off the debug information stripping during static linking" + echo "----------------------------------------------------------------------------" + exit 0 + ;; + -nolinkage) + # This internal option is used by wrapper driver scripts mpicc, mpicxx, mpifc when -v option is used.
+ linking=no + addarg=no + ;; + -g) + MPILIBDIR=${release_lib_dir} + ;; + -static_mpi) + static_mpi=yes + CXXFLAGS="$CXXFLAGS -Xlinker --export-dynamic" + addarg=no + ;; + -static) + static_mpi=yes + CXXFLAGS="$CXXFLAGS -Xlinker --export-dynamic" + addarg=yes + ;; + -mt_mpi) + addarg=no + ;; + -ilp64) + ilp64=yes + addarg=no + ;; + -check_mpi) + if [ -z "$profConf" ]; then + if [ -z "$I_MPI_CHECK_PROFILE" ]; then + profConf="vtmc" + else + profConf="$I_MPI_CHECK_PROFILE" + fi + else + echo "Warning: the -check_mpi option will be ignored because the profile was set." + fi + addarg=no + ;; + -trace-imbalance) + if [ -z "$profConf" ]; then + profConf="vtim" + else + echo "Warning: the -trace-imbalance option will be ignored because the profile was set." + fi + addarg=no + ;; + -t | -trace | -t=* | -trace=* ) + if [ -z "$profConf" ]; then + if [ -z "$I_MPI_TRACE_PROFILE" ]; then + profConf="vt" + else + profConf="$I_MPI_TRACE_PROFILE" + fi + else + echo "Warning: the -trace option will be ignored because the profile was set." + fi + # Disable strip to prevent debug symbols into separate dbg file in case of static linking IMPI-1493 + strip_debug_info=no + addarg=no + ;; + -fast) + echo "Warning: the -fast option forces static linkage method for the Intel(R) MPI Library." + static_mpi=yes + CXXFLAGS="$CXXFLAGS -Xlinker --export-dynamic" + ;; + -link_mpi=* ) + mpilib_override=$(echo A$arg | sed -e 's/A-link_mpi=//g') + addarg=no + ;; + -nostrip ) + strip_debug_info=no + addarg=no + ;; + -norpath ) + no_rpath=yes + addarg=no + ;; + # Other arguments. We are careful to handle arguments with + # quotes (we try to quote all arguments in case they include + # any spaces) + *\"*) + qarg="'$arg'" + ;; + *\'*) + qarg=$(echo \"$arg\") + ;; + *) + qarg="'$arg'" + ;; + esac + if [ $addarg = yes ] ; then + allargs="$allargs $qarg" + fi +done + +if [ $# -eq 0 ] ; then + echo "Error: Command line argument is needed!" + "$0" -help + exit 1 +fi + +if [ -n "$mpilib_override" ] ; then + case "$mpilib_override" in + opt ) + MPILIBDIR=${release_lib_dir} + ;; + opt_mt ) + MPILIBDIR=${release_lib_dir} + ;; + dbg ) + MPILIBDIR=${debug_lib_dir} + ;; + dbg_mt ) + MPILIBDIR=${debug_lib_dir} + ;; + * ) + echo "Warning: incorrect library version specified. Automatically selected library will be used." + ;; + esac +fi + +if [ "$static_mpi" = yes ] ; then + mpilibs="${libdir}/libmpifort.a ${libdir}${MPILIBDIR}/lib${MPILIBNAME}.a" + I_MPI_OTHERLIBS="" + MPI_OTHERLIBS=" -lrt -lpthread " + if [ "$ilp64" = yes ]; then + mpilibs="$libdir/libmpi_ilp64.a $mpilibs" + fi + + CXX_BIND_LIB="$libdir/libmpicxx.a" + if [ "x$strip_debug_info" = "x" ] ; then + strip_debug_info=yes + fi +else + mpilibs="-lmpifort -l$MPILIBNAME" + I_MPI_OTHERLIBS="" + MPI_OTHERLIBS=" -lrt -lpthread " + if [ "$ilp64" = yes ]; then + mpilibs="-lmpi_ilp64 $mpilibs" + fi + + CXX_BIND_LIB="-lmpicxx" +fi +# Derived variables. 
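
The derived variables assembled next include the -profile= hook: when lib<name>.a or lib<name>.so exists in $libdir the wrapper simply links -l<name>; otherwise it sources $sysconfdir/<name>.conf and honors three PROFILE_* variables. A sketch with a hypothetical myprof.conf (the myprof name and paths are invented):

    # $sysconfdir/myprof.conf, activated by "mpiicpx -profile=myprof app.cpp"
    PROFILE_INCPATHS="-I/opt/myprof/include"      # prepended to CXXFLAGS
    PROFILE_PRELIB="-L/opt/myprof/lib -lmyprof"   # linked before the MPI libraries
    PROFILE_POSTLIB=""                            # linked after the MPI libraries
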
These are assembled from variables set from the +# default, environment, configuration file (if any) and command-line +# options (if any) + +cxxlibs= +if [ "$MPICXXLIBNAME" != "$MPILIBNAME" ] ; then + cxxlibs="$CXX_BIND_LIB" +fi +# +# Handle the case of a profile switch +if [ -n "$profConf" ] ; then + profConffile= + if [ -s "$libdir/lib$profConf.a" ] || [ -s "$libdir/lib$profConf.so" ] ; then + mpilibs="-l$profConf $mpilibs" + elif [ -s "$sysconfdir/$profConf.conf" ] ; then + profConffile="$sysconfdir/$profConf.conf" + elif [ -s "$profConf.conf" ] ; then + profConffile="$profConf.conf" + else + echo "Profiling configuration file $profConf.conf not found in $sysconfdir" + fi + if [ -n "$profConffile" ] && [ -s "$profConffile" ] ; then + . $profConffile + if [ -n "$PROFILE_INCPATHS" ] ; then + CXXFLAGS="$PROFILE_INCPATHS $CXXFLAGS" + fi + if [ -n "$PROFILE_PRELIB" ] ; then + mpilibs="$PROFILE_PRELIB $mpilibs" + fi + if [ -n "$PROFILE_POSTLIB" ] ; then + mpilibs="$mpilibs $PROFILE_POSTLIB" + fi + fi +fi +# A temporary statement to invoke the compiler +# Place the -L before any args in case there are any mpi libraries in there. +# Eventually, we'll want to move this after any non-MPI implementation +# libraries + +if [ "${show_env}" = "yes" ]; then + env | more + exit 0 +fi + +if [ "$no_rpath" = "yes" ]; then + rpath_opt="-Xlinker --enable-new-dtags" +else + rpath_opt="-Xlinker --enable-new-dtags -Xlinker -rpath -Xlinker \"${libdir}${MPILIBDIR}\" -Xlinker -rpath -Xlinker \"${libdir}\"" +fi +if [ "$linking" = yes ] ; then + cmd_line="$CXX $CXXFLAGS $allargs -I\"${includedir}\" -L\"${libdir}${MPILIBDIR}\" -L\"${libdir}\" $rpath_opt $cxxlibs $mpilibs $I_MPI_OTHERLIBS $LDFLAGS $MPI_OTHERLIBS" + if [ "$Show" = echo ] ; then + echo $cmd_line + else + eval `echo $cmd_line` + fi + rc=$? + if [ $rc -eq 0 ] && [ "x$strip_debug_info" = "xyes" ] ; then + $Show objcopy --only-keep-debug ${executable} ${executable}.dbg + $Show objcopy --strip-debug ${executable} + $Show objcopy --add-gnu-debuglink=${executable}.dbg ${executable} + fi +else + cmd_line="$CXX $CXXFLAGS $allargs -I\"${includedir}\"" + if [ "$Show" = echo ] ; then + $Show $cmd_line + else + eval `echo $cmd_line` + fi + rc=$? +fi + +exit $rc + diff --git a/deps/mpi/bin/mpiicx new file mode 100755 index 000000000..196c4fd28 --- /dev/null +++ b/deps/mpi/bin/mpiicx @@ -0,0 +1,557 @@ +#!/bin/sh +# +# Copyright Intel Corporation. +# +# This software and the related documents are Intel copyrighted materials, and +# your use of them is governed by the express license under which they were +# provided to you (License). Unless the License provides otherwise, you may +# not use, modify, copy, publish, distribute, disclose or transmit this +# software or the related documents without Intel's prior written permission. +# +# This software and the related documents are provided as is, with no express +# or implied warranties, other than those that are expressly stated in the +# License. +#. +# (C) 2006 by Argonne National Laboratory. +# +# COPYRIGHT +# +# The following is a notice of limited availability of the code, and disclaimer +# which must be included in the prologue of the code and in all source listings +# of the code. +# +# Copyright Notice +# 1998--2020, Argonne National Laboratory +# +# Permission is hereby granted to use, reproduce, prepare derivative works, and +# to redistribute to others.
This software was authored by: +# +# Mathematics and Computer Science Division +# Argonne National Laboratory, Argonne IL 60439 +# +# (and) +# +# Department of Computer Science +# University of Illinois at Urbana-Champaign +# +# +# GOVERNMENT LICENSE +# +# Portions of this material resulted from work developed under a U.S. +# Government Contract and are subject to the following license: the Government +# is granted for itself and others acting on its behalf a paid-up, nonexclusive, +# irrevocable worldwide license in this computer software to reproduce, prepare +# derivative works, and perform publicly and display publicly. +# +# DISCLAIMER +# +# This computer code material was prepared, in part, as an account of work +# sponsored by an agency of the United States Government. Neither the United +# States, nor the University of Chicago, nor any of their employees, makes any +# warranty express or implied, or assumes any legal liability or responsibility +# for the accuracy, completeness, or usefulness of any information, apparatus, +# product, or process disclosed, or represents that its use would not infringe +# privately owned rights. +# +# EXTERNAL CONTRIBUTIONS +# +# Portions of this code have been contributed under the above license by: +# +# * Intel Corporation +# * Cray +# * IBM Corporation +# * Microsoft Corporation +# * Mellanox Technologies Ltd. +# * DataDirect Networks. +# * Oak Ridge National Laboratory +# * Sun Microsystems, Lustre group +# * Dolphin Interconnect Solutions Inc. +# * Institut Polytechnique de Bordeaux +# +# . +# +# mpicc +# Simple script to compile and/or link MPI programs. +# This script knows the default flags and libraries, and can handle +# alternative C compilers and the associated flags and libraries. +# The important terms are: +# includedir, libdir - Directories containing an *installed* mpich2 +# prefix, execprefix - Often used to define includedir and libdir +# CC - C compiler +# WRAPPER_CFLAGS - Any special flags needed to compile +# WRAPPER_LDFLAGS - Any special flags needed to link +# MPILIBNAME - Name of the MPI library +# MPI_OTHERLIBS - Other libraries needed in order to link +# +# We assume that (a) the C compiler can both compile and link programs +# +# Handling of command-line options: +# This is a little tricky because some options may contain blanks. +# +# Special issues with shared libraries - todo +# +# -------------------------------------------------------------------------- + +# Set the default values of all variables. +# +# Directory locations: Fixed for any MPI implementation. +# Set from the directory arguments to configure (e.g., --prefix=/usr/local) +prefix="" +# The environment variable I_MPI_ROOT may be used to override installation folder path +if [ -n "${I_MPI_ROOT}" ] ; then + prefix="${I_MPI_ROOT}"; +fi + +sysconfdir=${prefix}/etc +includedir=${prefix}/include +libdir=${prefix}/lib +if [ ! -f "${I_MPI_ROOT}/lib/mpi/debug/libmpi.so" ]; then + release_lib_dir="/release" + debug_lib_dir="/debug" +else + sysconfdir=${prefix}/opt/mpi/etc + release_lib_dir="" + debug_lib_dir="/mpi/debug" +fi +MPILIBDIR=${release_lib_dir} + +# The environment variable I_MPI_COMPILER_CONFIG_DIR may be used to override +# folder where *.conf files are placed +if [ -n "$I_MPI_COMPILER_CONFIG_DIR" ] ; then + sysconfdir=$I_MPI_COMPILER_CONFIG_DIR; +fi + +# +# Default settings for compiler, flags, and libraries. 
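
The file probe a few lines up (testing for lib/mpi/debug/libmpi.so) is how the new wrappers tell the two install layouts apart; relative to $I_MPI_ROOT the branch resolves roughly as follows (a summary of the code above, not an exhaustive listing):

    # probe file absent  (classic layout): libs in lib/release and lib/debug, *.conf in etc
    # probe file present (unified layout): libs directly in lib, debug libs in lib/mpi/debug,
    #                                      *.conf in opt/mpi/etc
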
+# Determined by a combination of environment variables and tests within +# configure (e.g., determining whether -lsocket is needed) +CC="icx" +CFLAGS="" +LDFLAGS="-ldl" +MPILIBNAME="mpi" + +# MPIVERSION is the version of the Intel(R) MPI Library that mpiicx is intended for +MPIVERSION="@IMPI_OFFICIALVERSION@" +# +# Internal variables +# Show is set to echo to cause the compilation command to be echoed instead +# of executed. +Show= +static_mpi=no +strip_debug_info= +handle_executable= +executable=a.out +ilp64=no +no_rpath=no +# +# End of initialization of variables +#--------------------------------------------------------------------- +# Environment Variables. +# The environment variables I_MPI_CC, MPICH_CC may be used to override the +# default choices. I_MPI_CC has higher priority than MPICH_CC. +# In addition, if there is a file $sysconfdir/mpicc-$CCname.conf, +# where CCname is the name of the compiler with all spaces replaced by hyphens +# (e.g., "cc -64" becomes "cc--64"), that file is sourced, allowing other +# changes to the compilation environment. See the variables used by the +# script (defined above) +if [ -n "$I_MPI_CC" ] ; then + CC="$I_MPI_CC" + CCname=$(echo "$CC" | sed 's/ /-/g') + if [ -s $sysconfdir/mpicc-$(basename $CCname).conf ] ; then + . $sysconfdir/mpicc-$(basename $CCname).conf + fi +else + if [ -n "$MPICH_CC" ] ; then + CC="$MPICH_CC" + CCname=$(echo $CC | sed 's/ /-/g') + if [ -s $sysconfdir/mpicc-$(basename $CCname).conf ] ; then + . $sysconfdir/mpicc-$(basename $CCname).conf + fi + fi +fi +if [ -n "$I_MPI_DEBUG_INFO_STRIP" ] ; then + for comp_val in "0" "off" "no" "disable" + do + if [ "$I_MPI_DEBUG_INFO_STRIP" = "$comp_val" ] ; then + strip_debug_info=no + break + fi + done +fi +# Allow a profiling option to be selected through an environment variable +if [ -n "$MPICC_PROFILE" ] ; then + profConf=$MPICC_PROFILE +fi +if [ -n "$I_MPI_CC_PROFILE" ] ; then + profConf=$I_MPI_CC_PROFILE +fi + +# Override default mpi library +if [ -n "$I_MPI_LINK" ] ; then + mpilib_override=$I_MPI_LINK +fi + +# +# ------------------------------------------------------------------------ +# Argument processing. +# This is somewhat awkward because of the handling of arguments within +# the shell. We want to handle arguments that include spaces without +# losing the spacing (an alternative would be to use a more powerful +# scripting language that would allow us to retain the array of values, +# which the basic (rather than enhanced) Bourne shell does not). +# +# Look through the arguments for arguments that indicate compile only.
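
The scan that follows only decides whether this invocation links at all; the flags -c, -S, -E, -M and -MM suppress linking. Two illustrative calls (the annotations paraphrase the effect, they are not wrapper output):

    $ mpiicx -c util.c              # compile only: no -L, rpath or MPI libraries are appended
    $ mpiicx util.o main.o -o app   # links: wrapper appends -L dirs, rpath flags, -lmpifort -lmpi, -ldl -lrt -lpthread
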
+# If these are *not* found, add the library options + +linking=yes +allargs="" +for arg in "$@" ; do + # Set addarg to no if this arg should be ignored by the C compiler + addarg=yes + qarg=$arg + if [ "x$handle_executable" = "xyes" ] ; then + executable=$arg + handle_executable= + fi + case "$arg" in + # ---------------------------------------------------------------- + # Compiler options that affect whether we are linking or not + -c|-S|-E|-M|-MM) + # The compiler links by default + linking=no + ;; + -o ) + handle_executable=yes + addarg=yes + ;; + # ---------------------------------------------------------------- + # Options that control how we use mpicc (e.g., -show, + # -cc=* -config=* + -echo) + addarg=no + set -x + ;; + -cc=*) + CC=$(echo A$arg | sed -e 's/A-cc=//g') + if [ "$#" -eq "1" ] ; then + echo "Error: extra arguments required" + echo "usage: $(basename $0) -cc= -v" + exit 1 + fi + addarg=no + ;; + -show) + addarg=no + Show=echo + ;; + -show_env) + show_env=yes + ;; + -config=*) + addarg=no + CCname=$(echo A$arg | sed -e 's/A-config=//g') + if [ -s "$sysconfdir/mpicc-$CCname.conf" ] ; then + . "$sysconfdir/mpicc-$CCname.conf" + else + echo "Configuration file mpicc-$CCname.conf not found" + fi + ;; + -compile-info|-compile_info) + # -compile_info included for backward compatibility + Show=echo + addarg=no + ;; + -link-info|-link_info) + # -link_info included for backward compatibility + Show=echo + addarg=no + ;; + -v) + # Pass this argument to the compiler as well. + echo "$(basename $0) for the Intel(R) MPI Library $MPIVERSION for Linux*" + echo "Copyright Intel Corporation." + linking=no + ;; + -V) + # Pass this argument to the compiler to query the compiler version. + linking=no + ;; + -profile=*) + # Pass the name of a profiling configuration. As + # a special case, lib.so or lib.la may be used + # if the library is in $libdir + # Loading the profConf file is handled below + profConf=$(echo A$arg | sed -e 's/A-profile=//g') + addarg=no + ;; + -help) + # Print mini-help if started without parameters + echo "Simple script to compile and/or link MPI programs." + echo "Usage: $(basename $0) [options] " + echo "----------------------------------------------------------------------------" + echo "The following options are supported:" + echo " -cc= specify a C compiler name: i.e. -cc=icc" + echo " -echo print the scripts during their execution" + echo " -show show command lines without real calling" + echo " -show_env show environment variables" + echo " -config= specify a configuration file: i.e. -config=icc for mpicc-icc.conf file" + echo " -v print version info of $(basename $0) and its native compiler" + echo " -profile= specify a profile configuration file (an MPI profiling" + echo " library): i.e. -profile=myprofile for the myprofile.cfg file." + echo " As a special case, lib.so or lib.a may be used" + echo " if the library is found" + echo " -check_mpi link against the Intel(R) Trace Collector (-profile=vtmc)."
+ echo " -static_mpi link the Intel(R) MPI Library statically" + echo " -mt_mpi link the thread safe version of the Intel(R) MPI Library" + echo " -ilp64 link the ILP64 support of the Intel(R) MPI Library" + echo " -t or -trace" + echo " link against the Intel(R) Trace Collector" + echo " -trace-imbalance" + echo " link against the Intel(R) Trace Collector imbalance library" + echo " (-profile=vtim)" + echo " -dynamic_log link against the Intel(R) Trace Collector dynamically" + echo " -static use static linkage method" + echo " -nostrip turn off the debug information stripping during static linking" + echo " -fast the same as -static_mpi + pass -fast option to a compiler" + echo " -O enable optimization" + echo " -link_mpi=" + echo " link against the specified version of the Intel(R) MPI Library" + echo " i.e -link_mpi=opt|opt_mt|dbg|dbg_mt" + echo " -norpath disable rpath for compiler wrapper of the Intel(R) MPI Library" + echo "All other options will be passed to the compiler without changing." + echo "----------------------------------------------------------------------------" + echo "The following environment variables are used:" + echo " I_MPI_ROOT the Intel(R) MPI Library installation directory path" + echo " I_MPI_CC or MPICH_CC" + echo " the path/name of the underlying compiler to be used" + echo " I_MPI_CC_PROFILE or MPICC_PROFILE" + echo " the name of profile file (without extension)" + echo " I_MPI_COMPILER_CONFIG_DIR" + echo " the folder which contains configuration files *.conf" + echo " I_MPI_TRACE_PROFILE" + echo " specify a default profile for the -trace option" + echo " I_MPI_CHECK_PROFILE" + echo " specify a default profile for the -check_mpi option" + echo " I_MPI_LINK specify the version of the Intel(R) MPI Library" + echo " I_MPI_DEBUG_INFO_STRIP" + echo " turn on/off the debug information stripping during static linking" + echo "----------------------------------------------------------------------------" + exit 0 + ;; + -nolinkage) + # This internal option is used by wrapper driver scripts mpicc, mpicxx, mpifc when -v option is used. + linking=no + addarg=no + ;; + -g) + MPILIBDIR=${release_lib_dir} + ;; + -static_mpi) + static_mpi=yes + CFLAGS="$CFLAGS -Xlinker --export-dynamic" + addarg=no + ;; + -static) + static_mpi=yes + CFLAGS="$CFLAGS -Xlinker --export-dynamic" + addarg=yes + ;; + -mt_mpi) + addarg=no + ;; + -ilp64) + ilp64=yes + addarg=no + ;; + -check_mpi) + if [ -z "$profConf" ]; then + if [ -z "$I_MPI_CHECK_PROFILE" ]; then + profConf="vtmc" + else + profConf="$I_MPI_CHECK_PROFILE" + fi + else + echo "Warning: the -check_mpi option will be ignored because the profile was set." + fi + addarg=no + ;; + -trace-imbalance) + if [ -z "$profConf" ]; then + profConf="vtim" + else + echo "Warning: the -trace-imbalance option will be ignored because the profile was set." + fi + addarg=no + ;; + -t | -trace | -t=* | -trace=* ) + if [ -z "$profConf" ]; then + if [ -z "$I_MPI_TRACE_PROFILE" ]; then + profConf="vt" + else + profConf="$I_MPI_TRACE_PROFILE" + fi + else + echo "Warning: the -trace option will be ignored because the profile was set." + fi + # Disable strip to prevent debug symbols into separate dbg file in case of static linking IMPI-1493 + strip_debug_info=no + addarg=no + ;; + -fast) + echo "Warning: the -fast option forces static linkage method for the Intel(R) MPI Library." 
+ static_mpi=yes + CFLAGS="$CFLAGS -Xlinker --export-dynamic" + ;; + -link_mpi=* ) + mpilib_override=`echo A$arg | sed -e 's/A-link_mpi=//g'` + addarg=no + ;; + -nostrip ) + strip_debug_info=no + addarg=no + ;; + -norpath ) + no_rpath=yes + addarg=no + ;; + # Other arguments. We are careful to handle arguments with + # quotes (we try to quote all arguments in case they include + # any spaces) + *\"*) + qarg="'$arg'" + ;; + *\'*) + qarg=$(echo \"$arg\") + ;; + *) + qarg="'$arg'" + ;; + esac + if [ $addarg = yes ] ; then + allargs="$allargs $qarg" + fi +done + +if [ $# -eq 0 ] ; then + echo "Error: Command line argument is needed!" + "$0" -help + exit 1 +fi + +if [ -n "$mpilib_override" ] ; then + case "$mpilib_override" in + opt ) + MPILIBDIR=${release_lib_dir} + ;; + opt_mt ) + MPILIBDIR=${release_lib_dir} + ;; + dbg ) + MPILIBDIR=${debug_lib_dir} + ;; + dbg_mt ) + MPILIBDIR=${debug_lib_dir} + ;; + * ) + echo "Warning: incorrect library version specified. Automatically selected library will be used." + ;; + esac +fi +# ----------------------------------------------------------------------- + +if [ "$static_mpi" = yes ] ; then + mpilibs="${libdir}/libmpifort.a ${libdir}${MPILIBDIR}/lib${MPILIBNAME}.a" + I_MPI_OTHERLIBS="" + MPI_OTHERLIBS=" -lrt -lpthread " + if [ "$ilp64" = yes ]; then + mpilibs="$libdir/libmpi_ilp64.a $mpilibs" + fi + if [ "x$strip_debug_info" = "x" ] ; then + strip_debug_info=yes + fi +else + mpilibs="-lmpifort -l$MPILIBNAME" + I_MPI_OTHERLIBS="" + MPI_OTHERLIBS=" -lrt -lpthread " + if [ "$ilp64" = yes ]; then + mpilibs="-lmpi_ilp64 $mpilibs" + fi +fi +# Derived variables. These are assembled from variables set from the +# default, environment, configuration file (if any) and command-line +# options (if any) + +# +# Handle the case of a profile switch +if [ -n "$profConf" ] ; then + profConffile= + if [ -s "$libdir/lib$profConf.a" ] || [ -s "$libdir/lib$profConf.so" ] ; then + mpilibs="-l$profConf $mpilibs" + elif [ -s "$sysconfdir/$profConf.conf" ] ; then + profConffile="$sysconfdir/$profConf.conf" + elif [ -s "$profConf.conf" ] ; then + profConffile="$profConf.conf" + else + echo "Profiling configuration file $profConf.conf not found in $sysconfdir" + fi + if [ -n "$profConffile" ] && [ -s "$profConffile" ] ; then + . $profConffile + if [ -n "$PROFILE_INCPATHS" ] ; then + CFLAGS="$PROFILE_INCPATHS $CFLAGS" + fi + if [ -n "$PROFILE_PRELIB" ] ; then + mpilibs="$PROFILE_PRELIB $mpilibs" + fi + if [ -n "$PROFILE_POSTLIB" ] ; then + mpilibs="$mpilibs $PROFILE_POSTLIB" + fi + fi +fi + +# ----------------------------------------------------------------------- +# +# A temporary statement to invoke the compiler +# Place the -L before any args in case there are any mpi libraries in there. +# Eventually, we'll want to move this after any non-MPI implementation +# libraries. +# We use a single invocation of the compiler. This will be adequate until +# we run into a system that uses a separate linking command. With any luck, +# such archaic systems are no longer with us. This also lets us +# accept any argument; we don't need to know if we've seen a source +# file or an object file. Instead, we just check for an option that +# suppresses linking, such as -c or -M.
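
Because the wrapper uses a single compiler invocation, the easiest way to inspect the command assembled by the block below is -show, which prints it instead of executing it. A hand-written approximation for the unified layout (where MPILIBDIR is empty), not captured from a real run:

    $ mpiicx -show hello.c
    icx  'hello.c' -I"$I_MPI_ROOT/include" -L"$I_MPI_ROOT/lib" -L"$I_MPI_ROOT/lib" \
        -Xlinker --enable-new-dtags -Xlinker -rpath -Xlinker "$I_MPI_ROOT/lib" \
        -Xlinker -rpath -Xlinker "$I_MPI_ROOT/lib" -lmpifort -lmpi -ldl -lrt -lpthread
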
+ +if [ "${show_env}" = "yes" ]; then + env | more + exit 0 +fi + +if [ "$no_rpath" = "yes" ]; then + rpath_opt="-Xlinker --enable-new-dtags" +else + rpath_opt="-Xlinker --enable-new-dtags -Xlinker -rpath -Xlinker \"${libdir}${MPILIBDIR}\" -Xlinker -rpath -Xlinker \"${libdir}\"" +fi +if [ "$linking" = yes ] ; then + cmd_line="$CC $CFLAGS $allargs -I\"${includedir}\" -L\"${libdir}${MPILIBDIR}\" -L\"${libdir}\" $rpath_opt $mpilibs $I_MPI_OTHERLIBS $LDFLAGS $MPI_OTHERLIBS" + if [ "$Show" = echo ] ; then + echo $cmd_line + else + eval $(echo $cmd_line) + fi + rc=$? + if [ $rc -eq 0 ] && [ "x$strip_debug_info" = "xyes" ] ; then + $Show objcopy --only-keep-debug ${executable} ${executable}.dbg + $Show objcopy --strip-debug ${executable} + $Show objcopy --add-gnu-debuglink=${executable}.dbg ${executable} + fi +else + cmd_line="$CC $CFLAGS $allargs -I\"${includedir}\"" + if [ "$Show" = echo ] ; then + echo "$cmd_line" + else + eval $(echo $cmd_line) + fi + rc=$? +fi + +exit $rc diff --git a/deps/mpi/bin/mpirun b/deps/mpi/bin/mpirun index 86628a81d..131f08f5d 100755 --- a/deps/mpi/bin/mpirun +++ b/deps/mpi/bin/mpirun @@ -30,6 +30,8 @@ if [ -z "$I_MPI_ROOT" -a -z "$(uname -m | grep 1om)" ] ; then . ${0%/*}/mpivars.sh "" elif [ -f ${0%/*}/../env/vars.sh ]; then . ${0%/*}/../env/vars.sh "" + elif [ -f ${0%/*}/../setvars.sh ]; then + . ${0%/*}/../setvars.sh "" fi # else it can be a runtime package without any scripts to source fi diff --git a/deps/mpi/include/mpi.h b/deps/mpi/include/mpi.h index fb8079d7f..3b2004544 100644 --- a/deps/mpi/include/mpi.h +++ b/deps/mpi/include/mpi.h @@ -82,6 +82,13 @@ #define MPICH_API_PUBLIC #endif + +#if defined(__SYCL_DEVICE_ONLY__) +#define IMPI_DEVICE_EXPORT SYCL_EXTERNAL +#else +#define IMPI_DEVICE_EXPORT +#endif + /* Keep C++ compilers from getting confused */ #if defined(__cplusplus) extern "C" { @@ -354,6 +361,10 @@ typedef int MPI_Group; typedef int MPI_Win; #define MPI_WIN_NULL ((MPI_Win)0x20000000) +/* for session */ +typedef int MPI_Session; +#define MPI_SESSION_NULL ((MPI_Session)0x38000000) + /* File and IO */ /* This define lets ROMIO know that MPI_File has been defined */ #define MPI_FILE_DEFINED @@ -419,6 +430,8 @@ static const MPI_Datatype mpich_mpi_datatype_null MPICH_ATTR_TYPE_TAG_MUST_BE_NU #define MPI_MAX_ERROR_STRING 512 #define MPI_MAX_PORT_NAME 256 #define MPI_MAX_OBJECT_NAME 128 +#define MPI_MAX_STRINGTAG_LEN 256 +#define MPI_MAX_PSET_NAME_LEN 256 /* Pre-defined constants */ #define MPI_UNDEFINED (-32766) @@ -472,11 +485,12 @@ typedef int (MPI_Win_delete_attr_function)(MPI_Win, int, void *, void *); typedef void (MPI_Comm_errhandler_function)(MPI_Comm *, int *, ...); typedef void (MPI_File_errhandler_function)(MPI_File *, int *, ...); typedef void (MPI_Win_errhandler_function)(MPI_Win *, int *, ...); +typedef void (MPI_Session_errhandler_function)(MPI_Session *, int *, ...); /* names that were added in MPI-2.0 and deprecated in MPI-2.2 */ typedef MPI_Comm_errhandler_function MPI_Comm_errhandler_fn; typedef MPI_File_errhandler_function MPI_File_errhandler_fn; typedef MPI_Win_errhandler_function MPI_Win_errhandler_fn; - +typedef MPI_Session_errhandler_function MPI_Session_errhandler_fn; /* Built in (0x1 in 30-31), errhandler (0x5 in bits 26-29, allkind (0 in 22-25), index in the low bits */ #define MPI_ERRORS_ARE_FATAL ((MPI_Errhandler)0x54000000) @@ -486,6 +500,7 @@ typedef MPI_Win_errhandler_function MPI_Win_errhandler_fn; Using the MPIR prefix preserved the MPI_ names for objects defined by the standard. 
 */
 #define MPIR_ERRORS_THROW_EXCEPTIONS ((MPI_Errhandler)0x54000002)
+#define MPI_ERRORS_ABORT ((MPI_Errhandler)0x54000003)
 typedef int MPI_Errhandler;
 
 /* Make the C names for the dup function mixed case.
@@ -584,8 +599,8 @@ typedef int (MPI_Delete_function) ( MPI_Comm, int, void *, void * );
  * digits for REV, 1 digit for EXT and 2 digits for EXT_NUMBER. So,
  * 2019.0.0b0 will have the numeric version 20190000100. */
-#define I_MPI_VERSION "2021.10.0"
-#define I_MPI_NUMVERSION 20211000300
+#define I_MPI_VERSION "2021.11.0"
+#define I_MPI_NUMVERSION 20211100300
 
 /* for the datatype decoders */
 enum MPIR_Combiner_enum {
@@ -640,6 +655,8 @@ typedef int MPI_Info;
 
 /* MPICH-specific types */
 #define MPIX_COMM_TYPE_NEIGHBORHOOD 2
+#define MPI_COMM_TYPE_HW_GUIDED 3
+
 /* Definitions that are determined by configure. */
 typedef long MPI_Aint;
 typedef int MPI_Fint;
@@ -792,6 +809,8 @@ typedef enum MPIR_T_pvar_class_t {
 #define MPI_Win_f2c(win) (MPI_Win)(win)
 #define MPI_Message_c2f(msg) ((MPI_Fint)(msg))
 #define MPI_Message_f2c(msg) ((MPI_Message)(msg))
+#define MPI_Session_c2f(session) (MPI_Fint)(session)
+#define MPI_Session_f2c(session) (MPI_Session)(session)
 
 /* PMPI versions of the handle transfer functions. See section 4.17 */
 #define PMPI_Comm_c2f(comm) (MPI_Fint)(comm)
@@ -812,6 +831,8 @@ typedef enum MPIR_T_pvar_class_t {
 #define PMPI_Win_f2c(win) (MPI_Win)(win)
 #define PMPI_Message_c2f(msg) ((MPI_Fint)(msg))
 #define PMPI_Message_f2c(msg) ((MPI_Message)(msg))
+#define PMPI_Session_c2f(session) (MPI_Fint)(session)
+#define PMPI_Session_f2c(session) (MPI_Session)(session)
 
 #define MPI_STATUS_IGNORE (MPI_Status *)1
 #define MPI_STATUSES_IGNORE (MPI_Status *)1
@@ -964,10 +985,11 @@ typedef int (MPIX_Grequest_wait_function)(int, void **, double, MPI_Status *);
 #define MPI_T_ERR_INVALID_NAME 73 /* Name doesn't match */
 #define MPI_T_ERR_INVALID 74 /* Generic error code for MPI_T added in MPI-3.1 */
+#define MPI_ERR_SESSION 75 /* Invalid session handle */
 
 #define MPI_ERR_LASTCODE 0x3fffffff /* Last valid error code for a predefined error class */
 
-#define MPICH_ERR_LAST_CLASS 74 /* It is also helpful to know the
+#define MPICH_ERR_LAST_CLASS 75 /* It is also helpful to know the
                                    last valid class */
 
 #define MPICH_ERR_FIRST_MPIX 100 /* Define a gap here because sock is
@@ -1013,6 +1035,20 @@ int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int ta
 int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag,
              MPI_Comm comm, MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
 int MPI_Get_count(const MPI_Status *status, MPI_Datatype datatype, int *count) MPICH_API_PUBLIC;
+int MPI_Comm_create_from_group(MPI_Group group, const char *stringtag, MPI_Info info,
+                               MPI_Errhandler errhandler, MPI_Comm *newcomm) MPICH_API_PUBLIC;
+int MPI_Group_from_session_pset(MPI_Session session, const char *pset_name, MPI_Group *newgroup)
+    MPICH_API_PUBLIC;
+int MPI_Session_finalize(MPI_Session *session) MPICH_API_PUBLIC;
+int MPI_Session_get_info(MPI_Session session, MPI_Info *info_used) MPICH_API_PUBLIC;
+int MPI_Session_get_nth_pset(MPI_Session session, MPI_Info info, int n, int *pset_len,
+                             char *pset_name) MPICH_API_PUBLIC;
+int MPI_Session_get_num_psets(MPI_Session session, MPI_Info info, int *npset_names)
+    MPICH_API_PUBLIC;
+int MPI_Session_get_pset_info(MPI_Session session, const char *pset_name, MPI_Info *info)
+    MPICH_API_PUBLIC;
+int MPI_Session_init(MPI_Info info, MPI_Errhandler errhandler, MPI_Session *session)
+    MPICH_API_PUBLIC;
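/* Editor's sketch (not part of the patch): how the Sessions entry points
 * declared above compose. Error handling is elided; "mpi://WORLD" is the
 * standard built-in process set, and the string tag is an arbitrary,
 * application-chosen name.
 *
 *     MPI_Session session = MPI_SESSION_NULL;
 *     MPI_Group group;
 *     MPI_Comm comm;
 *
 *     MPI_Session_init(MPI_INFO_NULL, MPI_ERRORS_ARE_FATAL, &session);
 *     MPI_Group_from_session_pset(session, "mpi://WORLD", &group);
 *     MPI_Comm_create_from_group(group, "org.example.sketch", MPI_INFO_NULL,
 *                                MPI_ERRORS_ARE_FATAL, &comm);
 *     ... use comm with MPI_Send/MPI_Recv and collectives ...
 *     MPI_Comm_free(&comm);
 *     MPI_Group_free(&group);
 *     MPI_Session_finalize(&session);
 */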
 int MPI_Bsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
               MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
 int MPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag,
@@ -1244,23 +1280,23 @@ int MPI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype origi
                    int target_rank, MPI_Aint target_disp, int target_count,
                    MPI_Datatype target_datatype, MPI_Op op, MPI_Win win)
                    MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+IMPI_DEVICE_EXPORT int MPI_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
             int target_rank, MPI_Aint target_disp, int target_count,
             MPI_Datatype target_datatype, MPI_Win win) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
-int MPI_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
+IMPI_DEVICE_EXPORT int MPI_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
             int target_rank, MPI_Aint target_disp, int target_count,
             MPI_Datatype target_datatype, MPI_Win win) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC;
 int MPI_Win_complete(MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_create(void *base, MPI_Aint size, int disp_unit, MPI_Info info,
                    MPI_Comm comm, MPI_Win *win) MPICH_API_PUBLIC;
-int MPI_Win_fence(int assert, MPI_Win win) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Win_fence(int assert, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_free(MPI_Win *win) MPICH_API_PUBLIC;
 int MPI_Win_get_group(MPI_Win win, MPI_Group *group) MPICH_API_PUBLIC;
-int MPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Win_lock(int lock_type, int rank, int assert, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_post(MPI_Group group, int assert, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_start(MPI_Group group, int assert, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_test(MPI_Win win, int *flag) MPICH_API_PUBLIC;
-int MPI_Win_unlock(int rank, MPI_Win win) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Win_unlock(int rank, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_wait(MPI_Win win) MPICH_API_PUBLIC;
 
 /* MPI-3 One-Sided Communication Routines */
@@ -1312,10 +1348,10 @@ int MPI_Rget_accumulate(const void *origin_addr, int origin_count,
                         MPI_Request *request)
                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3)
                         MPICH_ATTR_POINTER_WITH_TYPE_TAG(4,6) MPICH_API_PUBLIC;
-int MPI_Win_lock_all(int assert, MPI_Win win) MPICH_API_PUBLIC;
-int MPI_Win_unlock_all(MPI_Win win) MPICH_API_PUBLIC;
-int MPI_Win_flush(int rank, MPI_Win win) MPICH_API_PUBLIC;
-int MPI_Win_flush_all(MPI_Win win) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Win_lock_all(int assert, MPI_Win win) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Win_unlock_all(MPI_Win win) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Win_flush(int rank, MPI_Win win) MPICH_API_PUBLIC;
+IMPI_DEVICE_EXPORT int MPI_Win_flush_all(MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_flush_local(int rank, MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_flush_local_all(MPI_Win win) MPICH_API_PUBLIC;
 int MPI_Win_sync(MPI_Win win) MPICH_API_PUBLIC;
@@ -1380,6 +1416,11 @@ int MPI_File_create_errhandler(MPI_File_errhandler_function *file_errhandler_fn,
                                MPI_Errhandler *errhandler) MPICH_API_PUBLIC;
 int MPI_File_get_errhandler(MPI_File file, MPI_Errhandler *errhandler) MPICH_API_PUBLIC;
 int MPI_File_set_errhandler(MPI_File file, MPI_Errhandler errhandler) MPICH_API_PUBLIC;
+int MPI_Session_call_errhandler(MPI_Session session,
int errorcode) MPICH_API_PUBLIC; +int MPI_Session_create_errhandler(MPI_Session_errhandler_function *session_errhandler_fn, + MPI_Errhandler *errhandler) MPICH_API_PUBLIC; +int MPI_Session_get_errhandler(MPI_Session session, MPI_Errhandler *errhandler) MPICH_API_PUBLIC; +int MPI_Session_set_errhandler(MPI_Session session, MPI_Errhandler errhandler) MPICH_API_PUBLIC; int MPI_Finalized(int *flag) MPICH_API_PUBLIC; int MPI_Free_mem(void *base) MPICH_API_PUBLIC; int MPI_Get_address(const void *location, MPI_Aint *address) MPICH_API_PUBLIC; @@ -1658,6 +1699,20 @@ int PMPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int t int PMPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC; int PMPI_Get_count(const MPI_Status *status, MPI_Datatype datatype, int *count) MPICH_API_PUBLIC; +int PMPI_Comm_create_from_group(MPI_Group group, const char *stringtag, MPI_Info info, + MPI_Errhandler errhandler, MPI_Comm *newcomm) MPICH_API_PUBLIC; +int PMPI_Group_from_session_pset(MPI_Session session, const char *pset_name, MPI_Group *newgroup) + MPICH_API_PUBLIC; +int PMPI_Session_finalize(MPI_Session *session) MPICH_API_PUBLIC; +int PMPI_Session_get_info(MPI_Session session, MPI_Info *info_used) MPICH_API_PUBLIC; +int PMPI_Session_get_nth_pset(MPI_Session session, MPI_Info info, int n, int *pset_len, + char *pset_name) MPICH_API_PUBLIC; +int PMPI_Session_get_num_psets(MPI_Session session, MPI_Info info, int *npset_names) + MPICH_API_PUBLIC; +int PMPI_Session_get_pset_info(MPI_Session session, const char *pset_name, MPI_Info *info) + MPICH_API_PUBLIC; +int PMPI_Session_init(MPI_Info info, MPI_Errhandler errhandler, MPI_Session *session) + MPICH_API_PUBLIC; int PMPI_Bsend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm) MPICH_ATTR_POINTER_WITH_TYPE_TAG(1,3) MPICH_API_PUBLIC; int PMPI_Ssend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, @@ -2024,6 +2079,11 @@ int PMPI_File_create_errhandler(MPI_File_errhandler_function *file_errhandler_fn MPI_Errhandler *errhandler) MPICH_API_PUBLIC; int PMPI_File_get_errhandler(MPI_File file, MPI_Errhandler *errhandler) MPICH_API_PUBLIC; int PMPI_File_set_errhandler(MPI_File file, MPI_Errhandler errhandler) MPICH_API_PUBLIC; +int PMPI_Session_call_errhandler(MPI_Session session, int errorcode) MPICH_API_PUBLIC; +int PMPI_Session_create_errhandler(MPI_Session_errhandler_function *session_errhandler_fn, + MPI_Errhandler *errhandler) MPICH_API_PUBLIC; +int PMPI_Session_get_errhandler(MPI_Session session, MPI_Errhandler *errhandler) MPICH_API_PUBLIC; +int PMPI_Session_set_errhandler(MPI_Session session, MPI_Errhandler errhandler) MPICH_API_PUBLIC; int PMPI_Finalized(int *flag) MPICH_API_PUBLIC; int PMPI_Free_mem(void *base) MPICH_API_PUBLIC; int PMPI_Get_address(const void *location, MPI_Aint *address) MPICH_API_PUBLIC; @@ -2798,7 +2858,6 @@ int PMPI_Ssend_init_c(const void *buf, MPI_Count count, MPI_Datatype datatype, i #endif #endif - /* Generalized requests extensions */ typedef int MPIX_Grequest_class; int MPIX_Grequest_class_create(MPI_Grequest_query_function *query_fn, diff --git a/deps/mpi/lib/libmpi.so b/deps/mpi/lib/libmpi.so index bb5558da8..a728c1a42 100755 Binary files a/deps/mpi/lib/libmpi.so and b/deps/mpi/lib/libmpi.so differ diff --git a/deps/mpi/lib/libmpi.so.12 b/deps/mpi/lib/libmpi.so.12 index bb5558da8..a728c1a42 100755 Binary files a/deps/mpi/lib/libmpi.so.12 and 
b/deps/mpi/lib/libmpi.so.12 differ diff --git a/deps/mpi/lib/libmpi.so.12.0 b/deps/mpi/lib/libmpi.so.12.0 index bb5558da8..a728c1a42 100755 Binary files a/deps/mpi/lib/libmpi.so.12.0 and b/deps/mpi/lib/libmpi.so.12.0 differ diff --git a/deps/mpi/lib/libmpi.so.12.0.0 b/deps/mpi/lib/libmpi.so.12.0.0 index bb5558da8..a728c1a42 100755 Binary files a/deps/mpi/lib/libmpi.so.12.0.0 and b/deps/mpi/lib/libmpi.so.12.0.0 differ diff --git a/deps/mpi/lib/libmpifort.so b/deps/mpi/lib/libmpifort.so index 63c5627aa..26a93dc35 100755 Binary files a/deps/mpi/lib/libmpifort.so and b/deps/mpi/lib/libmpifort.so differ diff --git a/deps/mpi/lib/libmpifort.so.12 b/deps/mpi/lib/libmpifort.so.12 index 63c5627aa..26a93dc35 100755 Binary files a/deps/mpi/lib/libmpifort.so.12 and b/deps/mpi/lib/libmpifort.so.12 differ diff --git a/deps/mpi/lib/libmpifort.so.12.0 b/deps/mpi/lib/libmpifort.so.12.0 index 63c5627aa..26a93dc35 100755 Binary files a/deps/mpi/lib/libmpifort.so.12.0 and b/deps/mpi/lib/libmpifort.so.12.0 differ diff --git a/deps/mpi/lib/libmpifort.so.12.0.0 b/deps/mpi/lib/libmpifort.so.12.0.0 index 63c5627aa..26a93dc35 100755 Binary files a/deps/mpi/lib/libmpifort.so.12.0.0 and b/deps/mpi/lib/libmpifort.so.12.0.0 differ diff --git a/deps/mpi/licensing/third-party-programs.txt b/deps/mpi/licensing/third-party-programs.txt index 187bc34e5..78202fa91 100644 --- a/deps/mpi/licensing/third-party-programs.txt +++ b/deps/mpi/licensing/third-party-programs.txt @@ -1,4 +1,4 @@ -Intel(R) MPI Library 2021.10 Third Party Programs File +Intel(R) MPI Library 2021.11 Third Party Programs File This file is the "third-party-programs.txt" file specified in the associated Intel end user license agreement for the Intel software you are licensing. @@ -704,8 +704,8 @@ SOFTWARE. The following third party programs have their own third party programs. These additional third party program files are as follows: - 1. Intel(R) MPI Benchmarks \mpi\latest\benchmarks\imb\license\third-party-programs.txt - 2. Intel(R) Distribution for Python* \intelpython\latest\licensing\third-party-programs.txt + 1. Intel(R) MPI Benchmarks /mpi/latest/opt/mpi/benchmarks/imb/license/third-party-programs.txt + 2. 
Intel(R) Distribution for Python* /intelpython/latest/licensing/third-party-programs.txt ------------------------------------------------------------------------------- diff --git a/deps/mpi/etc/tuning_clx-ap_ofi.dat b/deps/mpi/opt/mpi/etc/tuning_clx-ap_ofi.dat similarity index 100% rename from deps/mpi/etc/tuning_clx-ap_ofi.dat rename to deps/mpi/opt/mpi/etc/tuning_clx-ap_ofi.dat diff --git a/deps/mpi/etc/tuning_clx-ap_shm-ofi.dat b/deps/mpi/opt/mpi/etc/tuning_clx-ap_shm-ofi.dat similarity index 100% rename from deps/mpi/etc/tuning_clx-ap_shm-ofi.dat rename to deps/mpi/opt/mpi/etc/tuning_clx-ap_shm-ofi.dat diff --git a/deps/mpi/etc/tuning_clx-ap_shm.dat b/deps/mpi/opt/mpi/etc/tuning_clx-ap_shm.dat similarity index 100% rename from deps/mpi/etc/tuning_clx-ap_shm.dat rename to deps/mpi/opt/mpi/etc/tuning_clx-ap_shm.dat diff --git a/deps/mpi/etc/tuning_generic_ofi.dat b/deps/mpi/opt/mpi/etc/tuning_generic_ofi.dat similarity index 100% rename from deps/mpi/etc/tuning_generic_ofi.dat rename to deps/mpi/opt/mpi/etc/tuning_generic_ofi.dat diff --git a/deps/mpi/etc/tuning_generic_shm-ofi.dat b/deps/mpi/opt/mpi/etc/tuning_generic_shm-ofi.dat similarity index 100% rename from deps/mpi/etc/tuning_generic_shm-ofi.dat rename to deps/mpi/opt/mpi/etc/tuning_generic_shm-ofi.dat diff --git a/deps/mpi/etc/tuning_generic_shm.dat b/deps/mpi/opt/mpi/etc/tuning_generic_shm.dat similarity index 100% rename from deps/mpi/etc/tuning_generic_shm.dat rename to deps/mpi/opt/mpi/etc/tuning_generic_shm.dat diff --git a/deps/mpi/etc/tuning_skx_ofi.dat b/deps/mpi/opt/mpi/etc/tuning_skx_ofi.dat similarity index 100% rename from deps/mpi/etc/tuning_skx_ofi.dat rename to deps/mpi/opt/mpi/etc/tuning_skx_ofi.dat diff --git a/deps/mpi/etc/tuning_skx_shm-ofi.dat b/deps/mpi/opt/mpi/etc/tuning_skx_shm-ofi.dat similarity index 100% rename from deps/mpi/etc/tuning_skx_shm-ofi.dat rename to deps/mpi/opt/mpi/etc/tuning_skx_shm-ofi.dat diff --git a/deps/mpi/etc/tuning_skx_shm.dat b/deps/mpi/opt/mpi/etc/tuning_skx_shm.dat similarity index 100% rename from deps/mpi/etc/tuning_skx_shm.dat rename to deps/mpi/opt/mpi/etc/tuning_skx_shm.dat diff --git a/deps/ofi/bin/fi_info b/deps/ofi/bin/fi_info index 89077a6fe..35dc60fe3 100755 Binary files a/deps/ofi/bin/fi_info and b/deps/ofi/bin/fi_info differ diff --git a/deps/ofi/include/rdma/fabric.h b/deps/ofi/include/rdma/fabric.h index f911526f2..0b9a4c157 100644 --- a/deps/ofi/include/rdma/fabric.h +++ b/deps/ofi/include/rdma/fabric.h @@ -84,7 +84,7 @@ extern "C" { #endif #define FI_MAJOR_VERSION 1 -#define FI_MINOR_VERSION 16 +#define FI_MINOR_VERSION 18 #define FI_REVISION_VERSION 1 enum { @@ -165,9 +165,9 @@ typedef struct fid *fid_t; #define FI_COMMIT_COMPLETE (1ULL << 30) #define FI_MATCH_COMPLETE (1ULL << 31) +#define FI_PEER_TRANSFER (1ULL << 36) #define FI_AV_USER_ID (1ULL << 41) -#define FI_PEER_SRX (1ULL << 42) -#define FI_PEER_CQ (1ULL << 43) +#define FI_PEER (1ULL << 43) #define FI_XPU_TRIGGER (1ULL << 44) #define FI_HMEM_HOST_ALLOC (1ULL << 45) #define FI_HMEM_DEVICE_ONLY (1ULL << 46) @@ -216,6 +216,7 @@ enum { FI_ADDR_PSMX3, /* uint64_t[4] */ FI_ADDR_OPX, FI_ADDR_CXI, + FI_ADDR_UCX, }; #define FI_ADDR_UNSPEC ((uint64_t) -1) @@ -334,6 +335,8 @@ enum { FI_PROTO_OPX, FI_PROTO_CXI, FI_PROTO_XNET, + FI_PROTO_COLL, + FI_PROTO_UCX, }; enum { @@ -368,6 +371,7 @@ static inline uint8_t fi_tc_dscp_get(uint32_t tclass) #define FI_RESTRICTED_COMP (1ULL << 53) #define FI_CONTEXT2 (1ULL << 52) #define FI_BUFFERED_RECV (1ULL << 51) +/* #define FI_PEER_TRANSFER (1ULL << 36) */ struct fi_tx_attr { 
uint64_t caps; @@ -536,6 +540,8 @@ enum { FI_CLASS_PEER_CQ, FI_CLASS_PEER_SRX, FI_CLASS_LOG, + FI_CLASS_PEER_AV, + FI_CLASS_PEER_AV_SET, }; struct fi_eq_attr; @@ -586,6 +592,8 @@ struct fi_ops_fabric { struct fid_wait **waitset); int (*trywait)(struct fid_fabric *fabric, struct fid **fids, int count); + int (*domain2)(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **dom, uint64_t flags, void *context); }; struct fid_fabric { diff --git a/deps/ofi/include/rdma/fi_collective.h b/deps/ofi/include/rdma/fi_collective.h index 41528b54f..9e7ac6297 100644 --- a/deps/ofi/include/rdma/fi_collective.h +++ b/deps/ofi/include/rdma/fi_collective.h @@ -92,8 +92,7 @@ struct fi_msg_collective { struct fi_ops_collective { size_t size; - ssize_t (*barrier)(struct fid_ep *ep, fi_addr_t coll_addr, - void *context); + ssize_t (*barrier)(struct fid_ep *ep, fi_addr_t coll_addr, void *context); ssize_t (*broadcast)(struct fid_ep *ep, void *buf, size_t count, void *desc, fi_addr_t coll_addr, fi_addr_t root_addr, @@ -135,6 +134,8 @@ struct fi_ops_collective { const struct fi_msg_collective *msg, struct fi_ioc *resultv, void **result_desc, size_t result_count, uint64_t flags); + ssize_t (*barrier2)(struct fid_ep *ep, fi_addr_t coll_addr, uint64_t flags, + void *context); }; @@ -206,6 +207,17 @@ fi_barrier(struct fid_ep *ep, fi_addr_t coll_addr, void *context) return ep->collective->barrier(ep, coll_addr, context); } +static inline ssize_t +fi_barrier2(struct fid_ep *ep, fi_addr_t coll_addr, uint64_t flags, void *context) +{ + if (!flags) + return fi_barrier(ep, coll_addr, context); + + return FI_CHECK_OP(ep->collective, struct fi_ops_collective, barrier2) ? + ep->collective->barrier2(ep, coll_addr, flags, context) : + -FI_ENOSYS; +} + static inline ssize_t fi_broadcast(struct fid_ep *ep, void *buf, size_t count, void *desc, fi_addr_t coll_addr, fi_addr_t root_addr, diff --git a/deps/ofi/include/rdma/fi_domain.h b/deps/ofi/include/rdma/fi_domain.h index 05b30b831..6a3e4b7d8 100644 --- a/deps/ofi/include/rdma/fi_domain.h +++ b/deps/ofi/include/rdma/fi_domain.h @@ -130,6 +130,11 @@ enum fi_hmem_iface { FI_HMEM_SYNAPSEAI, }; +static inline int fi_hmem_ze_device(int driver_index, int device_index) +{ + return driver_index << 16 | device_index; +} + struct fi_mr_attr { const struct iovec *mr_iov; size_t iov_count; @@ -285,6 +290,8 @@ struct fi_ops_domain { int (*query_collective)(struct fid_domain *domain, enum fi_collective_op coll, struct fi_collective_attr *attr, uint64_t flags); + int (*endpoint2)(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, uint64_t flags, void *context); }; /* Memory registration flags */ @@ -326,6 +333,18 @@ fi_domain(struct fid_fabric *fabric, struct fi_info *info, return fabric->ops->domain(fabric, info, domain, context); } +static inline int +fi_domain2(struct fid_fabric *fabric, struct fi_info *info, + struct fid_domain **domain, uint64_t flags, void *context) +{ + if (!flags) + return fi_domain(fabric, info, domain, context); + + return FI_CHECK_OP(fabric->ops, struct fi_ops_fabric, domain2) ? 
+ fabric->ops->domain2(fabric, info, domain, flags, context) : + -FI_ENOSYS; +} + static inline int fi_domain_bind(struct fid_domain *domain, struct fid *fid, uint64_t flags) { diff --git a/deps/ofi/include/rdma/fi_endpoint.h b/deps/ofi/include/rdma/fi_endpoint.h index 56df151c7..cf0611b1b 100644 --- a/deps/ofi/include/rdma/fi_endpoint.h +++ b/deps/ofi/include/rdma/fi_endpoint.h @@ -68,6 +68,7 @@ enum { FI_OPT_RX_SIZE, FI_OPT_FI_HMEM_P2P, /* int */ FI_OPT_XPU_TRIGGER, /* struct fi_trigger_xpu */ + FI_OPT_CUDA_API_PERMITTED, /* bool */ }; /* @@ -177,6 +178,18 @@ fi_endpoint(struct fid_domain *domain, struct fi_info *info, return domain->ops->endpoint(domain, info, ep, context); } +static inline int +fi_endpoint2(struct fid_domain *domain, struct fi_info *info, + struct fid_ep **ep, uint64_t flags, void *context) +{ + if (!flags) + return fi_endpoint(domain, info, ep, context); + + return FI_CHECK_OP(domain->ops, struct fi_ops_domain, endpoint2) ? + domain->ops->endpoint2(domain, info, ep, flags, context) : + -FI_ENOSYS; +} + static inline int fi_scalable_ep(struct fid_domain *domain, struct fi_info *info, struct fid_ep **sep, void *context) diff --git a/deps/ofi/include/rdma/fi_ext.h b/deps/ofi/include/rdma/fi_ext.h index 412007c16..a17288dae 100644 --- a/deps/ofi/include/rdma/fi_ext.h +++ b/deps/ofi/include/rdma/fi_ext.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Intel Corporation. All rights reserved. + * Copyright (c) 2021-2023 Intel Corporation. All rights reserved. * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights reserved. * Copyright (c) 2022 DataDirect Networks, Inc. All rights reserved. * @@ -69,7 +69,13 @@ extern "C" { /* negative options are provider specific */ enum { - FI_OPT_EFA_RNR_RETRY = -FI_PROV_SPECIFIC_EFA, + FI_OPT_EFA_RNR_RETRY = -FI_PROV_SPECIFIC_EFA, + FI_OPT_EFA_EMULATED_READ, /* bool */ + FI_OPT_EFA_EMULATED_WRITE, /* bool */ + FI_OPT_EFA_EMULATED_ATOMICS, /* bool */ + FI_OPT_EFA_USE_DEVICE_RDMA, /* bool */ + FI_OPT_EFA_SENDRECV_IN_ORDER_ALIGNED_128_BYTES, /* bool */ + FI_OPT_EFA_WRITE_IN_ORDER_ALIGNED_128_BYTES, /* bool */ }; struct fi_fid_export { @@ -129,80 +135,6 @@ struct fid_mem_monitor { }; -/* Peer provider CQ support. 
*/ -struct fid_peer_cq; - -struct fi_ops_cq_owner { - size_t size; - ssize_t (*write)(struct fid_peer_cq *cq, void *context, uint64_t flags, - size_t len, void *buf, uint64_t data, uint64_t tag, - fi_addr_t src); - ssize_t (*writeerr)(struct fid_peer_cq *cq, - const struct fi_cq_err_entry *err_entry); -}; - -struct fid_peer_cq { - struct fid fid; - struct fi_ops_cq_owner *owner_ops; -}; - -struct fi_peer_cq_context { - size_t size; - struct fid_peer_cq *cq; -}; - - -/* Peer shared rx context */ -struct fid_peer_srx; - -/* Castable to dlist_entry */ -struct fi_peer_rx_entry { - struct fi_peer_rx_entry *next; - struct fi_peer_rx_entry *prev; - struct fid_peer_srx *srx; - fi_addr_t addr; - size_t size; - uint64_t tag; - uint64_t flags; - void *context; - size_t count; - void **desc; - void *peer_context; - void *owner_context; - struct iovec *iov; -}; - -struct fi_ops_srx_owner { - size_t size; - int (*get_msg)(struct fid_peer_srx *srx, fi_addr_t addr, - size_t size, struct fi_peer_rx_entry **entry); - int (*get_tag)(struct fid_peer_srx *srx, fi_addr_t addr, - uint64_t tag, struct fi_peer_rx_entry **entry); - int (*queue_msg)(struct fi_peer_rx_entry *entry); - int (*queue_tag)(struct fi_peer_rx_entry *entry); - - void (*free_entry)(struct fi_peer_rx_entry *entry); -}; - -struct fi_ops_srx_peer { - size_t size; - int (*start_msg)(struct fi_peer_rx_entry *entry); - int (*start_tag)(struct fi_peer_rx_entry *entry); - int (*discard_msg)(struct fi_peer_rx_entry *entry); - int (*discard_tag)(struct fi_peer_rx_entry *entry); -}; - -struct fid_peer_srx { - struct fid_ep ep_fid; - struct fi_ops_srx_owner *owner_ops; - struct fi_ops_srx_peer *peer_ops; -}; - -struct fi_peer_srx_context { - size_t size; - struct fid_peer_srx *srx; -}; - /* * System logging import extension: * To use, open logging fid and import. @@ -248,7 +180,8 @@ static inline int fi_import_log(uint32_t version, uint64_t flags, log_fid->fid.fclass = FI_CLASS_LOG; log_fid->ops->size = sizeof(struct fi_ops_log); - return fi_import(version, "logging", NULL, 0, flags, &log_fid->fid, log_fid); + return fi_import(version, "logging", NULL, 0, flags, &log_fid->fid, + log_fid); } #ifdef __cplusplus diff --git a/deps/ofi/include/rdma/providers/fi_peer.h b/deps/ofi/include/rdma/providers/fi_peer.h new file mode 100644 index 000000000..be4e77513 --- /dev/null +++ b/deps/ofi/include/rdma/providers/fi_peer.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2021-2023 Intel Corporation. All rights reserved. + * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright (c) 2022 DataDirect Networks, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef FI_PEER_H +#define FI_PEER_H + +#include +#include +#include +#include +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Peer provider AV support. + */ +struct fid_peer_av; + +struct fi_ops_av_owner { + size_t size; + int (*query)(struct fid_peer_av *av, struct fi_av_attr *attr); + fi_addr_t (*ep_addr)(struct fid_peer_av *av, struct fid_ep *ep); +}; + +struct fid_peer_av { + struct fid fid; + struct fi_ops_av_owner *owner_ops; +}; + +struct fi_peer_av_context { + size_t size; + struct fid_peer_av *av; +}; + + +/* + * Peer provider AV set support. + */ +struct fid_peer_av_set; + +struct fi_ops_av_set_owner { + size_t size; + int (*members)(struct fid_peer_av_set *av, fi_addr_t *addr, + size_t *count); +}; + +struct fid_peer_av_set { + struct fid fid; + struct fi_ops_av_set_owner *owner_ops; +}; + +struct fi_peer_av_set_context { + size_t size; + struct fi_peer_av_set *av_set; +}; + + +/* + * Peer provider CQ support. + */ +struct fid_peer_cq; + +struct fi_ops_cq_owner { + size_t size; + ssize_t (*write)(struct fid_peer_cq *cq, void *context, uint64_t flags, + size_t len, void *buf, uint64_t data, uint64_t tag, + fi_addr_t src); + ssize_t (*writeerr)(struct fid_peer_cq *cq, + const struct fi_cq_err_entry *err_entry); +}; + +struct fid_peer_cq { + struct fid fid; + struct fi_ops_cq_owner *owner_ops; +}; + +struct fi_peer_cq_context { + size_t size; + struct fid_peer_cq *cq; +}; + + +/* + * Peer provider domain support. + */ +struct fi_peer_domain_context { + size_t size; + struct fid_domain *domain; +}; + + +/* + * Peer provider EQ support. 
+ */ +struct fi_peer_eq_context { + size_t size; + struct fid_eq *eq; +}; + + +/* + * Peer shared rx context + */ +struct fid_peer_srx; + +/* Castable to dlist_entry */ +struct fi_peer_rx_entry { + struct fi_peer_rx_entry *next; + struct fi_peer_rx_entry *prev; + struct fid_peer_srx *srx; + fi_addr_t addr; + size_t size; + uint64_t tag; + uint64_t flags; + void *context; + size_t count; + void **desc; + void *peer_context; + void *owner_context; + struct iovec *iov; +}; + +struct fi_ops_srx_owner { + size_t size; + int (*get_msg)(struct fid_peer_srx *srx, fi_addr_t addr, + size_t size, struct fi_peer_rx_entry **entry); + int (*get_tag)(struct fid_peer_srx *srx, fi_addr_t addr, + uint64_t tag, struct fi_peer_rx_entry **entry); + int (*queue_msg)(struct fi_peer_rx_entry *entry); + int (*queue_tag)(struct fi_peer_rx_entry *entry); + + void (*free_entry)(struct fi_peer_rx_entry *entry); +}; + +struct fi_ops_srx_peer { + size_t size; + int (*start_msg)(struct fi_peer_rx_entry *entry); + int (*start_tag)(struct fi_peer_rx_entry *entry); + int (*discard_msg)(struct fi_peer_rx_entry *entry); + int (*discard_tag)(struct fi_peer_rx_entry *entry); +}; + +struct fid_peer_srx { + struct fid_ep ep_fid; + struct fi_ops_srx_owner *owner_ops; + struct fi_ops_srx_peer *peer_ops; +}; + +struct fi_peer_srx_context { + size_t size; + struct fid_peer_srx *srx; +}; + + +/* + * Peer transfers + */ +struct fi_peer_transfer_context; + +struct fi_ops_transfer_peer { + size_t size; + ssize_t (*complete)(struct fid_ep *ep, struct fi_cq_tagged_entry *buf, + fi_addr_t src_addr); + ssize_t (*comperr)(struct fid_ep *ep, struct fi_cq_err_entry *buf); +}; + +struct fi_peer_transfer_context { + size_t size; + struct fi_info *info; + struct fid_ep *ep; + struct fi_ops_transfer_peer *peer_ops; +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* FI_PEER_H */ diff --git a/deps/ofi/lib/libfabric.so b/deps/ofi/lib/libfabric.so index 06c884e1a..193603003 100755 Binary files a/deps/ofi/lib/libfabric.so and b/deps/ofi/lib/libfabric.so differ diff --git a/deps/ofi/lib/libfabric.so.1 b/deps/ofi/lib/libfabric.so.1 index 06c884e1a..193603003 100755 Binary files a/deps/ofi/lib/libfabric.so.1 and b/deps/ofi/lib/libfabric.so.1 differ diff --git a/deps/ofi/lib/prov/libpsm3-fi.so b/deps/ofi/lib/prov/libpsm3-fi.so index 0a937ac88..6dfdb8c17 100755 Binary files a/deps/ofi/lib/prov/libpsm3-fi.so and b/deps/ofi/lib/prov/libpsm3-fi.so differ diff --git a/deps/ofi/lib/prov/libpsmx2-fi.so b/deps/ofi/lib/prov/libpsmx2-fi.so index dfce6641f..d16692935 100755 Binary files a/deps/ofi/lib/prov/libpsmx2-fi.so and b/deps/ofi/lib/prov/libpsmx2-fi.so differ diff --git a/deps/ofi/lib/prov/librxm-fi.so b/deps/ofi/lib/prov/librxm-fi.so index a10783523..2018314bd 100755 Binary files a/deps/ofi/lib/prov/librxm-fi.so and b/deps/ofi/lib/prov/librxm-fi.so differ diff --git a/deps/ofi/lib/prov/libshm-fi.so b/deps/ofi/lib/prov/libshm-fi.so index b461c3f3e..7db1a1c78 100755 Binary files a/deps/ofi/lib/prov/libshm-fi.so and b/deps/ofi/lib/prov/libshm-fi.so differ diff --git a/deps/ofi/lib/prov/libtcp-fi.so b/deps/ofi/lib/prov/libtcp-fi.so index 89be5f0e8..bce823911 100755 Binary files a/deps/ofi/lib/prov/libtcp-fi.so and b/deps/ofi/lib/prov/libtcp-fi.so differ diff --git a/deps/ofi/lib/prov/libverbs-1.1-fi.so b/deps/ofi/lib/prov/libverbs-1.1-fi.so old mode 100755 new mode 100644 index 14f00726c..2384ebc22 Binary files a/deps/ofi/lib/prov/libverbs-1.1-fi.so and b/deps/ofi/lib/prov/libverbs-1.1-fi.so differ diff --git a/deps/ofi/lib/prov/libverbs-1.12-fi.so 
b/deps/ofi/lib/prov/libverbs-1.12-fi.so old mode 100755 new mode 100644 index 1998f3ecf..30f87d3be
Binary files a/deps/ofi/lib/prov/libverbs-1.12-fi.so and b/deps/ofi/lib/prov/libverbs-1.12-fi.so differ
diff --git a/deps/ofi/lib/prov/libverbs-fi.so b/deps/ofi/lib/prov/libverbs-fi.so
deleted file mode 100755
index 75003ff0f..000000000
Binary files a/deps/ofi/lib/prov/libverbs-fi.so and /dev/null differ
diff --git a/doc/rst/source/advanced-configuration/dmabuf.rst b/doc/rst/source/advanced-configuration/dmabuf.rst
index b944be120..4048a7f4d 100644
--- a/doc/rst/source/advanced-configuration/dmabuf.rst
+++ b/doc/rst/source/advanced-configuration/dmabuf.rst
@@ -54,20 +54,7 @@ OFI
 Run instructions
 ################
 
-1. Set the environment.
-
-   If |base_tk| is used:
-
-   ::
-
-      source /setvars.sh
-
-   If software components are built from sources:
-
-   ::
-
-      source /env/setvars.sh
-      export LD_LIBRARY_PATH=/lib:${LD_LIBRARY_PATH}
+1. Set the environment. See `Get Started Guide `_.
 
 2. Run allreduce test with ring algorithm and SYCL USM device buffers:
diff --git a/doc/rst/source/api/operations.rst b/doc/rst/source/api/operations.rst
index 4cf2b12df..45ea2cdbe 100644
--- a/doc/rst/source/api/operations.rst
+++ b/doc/rst/source/api/operations.rst
@@ -1,6 +1,6 @@
 .. _`communication operations`: https://spec.oneapi.com/versions/latest/elements/oneCCL/source/spec/operations.html
 
-Communication operations
+Communication Operations
 ========================
 
 Refer to |product_short| specification for more details about `communication operations`.
@@ -10,3 +10,4 @@ Refer to |product_short| specification for more details about `communication ope
 
     operations/datatypes.rst
     operations/collective-operations.rst
+    operations/point-to-point.rst
diff --git a/doc/rst/source/api/operations/point-to-point.rst b/doc/rst/source/api/operations/point-to-point.rst
new file mode 100644
index 000000000..a34ff9df4
--- /dev/null
+++ b/doc/rst/source/api/operations/point-to-point.rst
@@ -0,0 +1,194 @@
+.. _point-to-point:
+
+Point-To-Point Operations
+**************************
+
+Point-to-point operations enable direct communication between two specific entities, facilitating data exchange, synchronization, and coordination within a parallel computing environment.
+
+The following point-to-point operations are available in oneCCL:
+
+* ``send``
+* ``recv``
+
+send
+====
+
+``send`` is a blocking point-to-point communication operation that transfers data from a designated memory buffer (``buf``) to a specific peer rank.
+
+.. code:: cpp
+
+   event CCL_API send(void *buf,
+                      size_t count,
+                      datatype dtype,
+                      int peer,
+                      const communicator &comm,
+                      const stream &stream,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
+
+**Parameters**
+
+* ``buf`` - A buffer with ``count`` elements of ``dtype`` that contains the data to be sent.
+* ``count`` - The number of ``dtype`` elements in ``buf``.
+* ``dtype`` - The datatype of elements in ``buf``.
+* ``peer`` - A destination rank.
+* ``comm`` - A communicator for which the operation is performed.
+* ``stream`` - A stream associated with the operation.
+* ``attr`` - Optional attributes to customize the operation.
+* ``deps`` - An optional vector of events on which the operation should depend.
+
+**Returns**
+
+``ccl::event`` - An object to track the progress of the operation.
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   event CCL_API send(void* buf,
+                      size_t count,
+                      datatype dtype,
+                      int peer,
+                      const communicator &comm,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   template (), event>::type>
+   event CCL_API send(BufferType *buf,
+                      size_t count,
+                      int peer,
+                      const communicator &comm,
+                      const stream &stream,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class& deps = {});
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   event CCL_API send(BufferType *buf,
+                      size_t count,
+                      int peer,
+                      const communicator &comm,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   event CCL_API send(BufferObjectType &buf,
+                      size_t count,
+                      int peer,
+                      const communicator &comm,
+                      const stream &stream,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   event CCL_API send(BufferObjectType &buf,
+                      size_t count,
+                      int peer,
+                      const communicator &comm,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
+
+
+recv
+=====
+
+``recv`` is a blocking point-to-point communication operation that receives data from a peer rank into a memory buffer.
+
+.. code:: cpp
+
+   event CCL_API recv(void *buf,
+                      size_t count,
+                      datatype dtype,
+                      int peer,
+                      const communicator &comm,
+                      const stream &stream,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
+
+**Parameters**
+
+* ``buf`` - A buffer with ``count`` elements of ``dtype`` where the received data is stored.
+* ``count`` - The number of ``dtype`` elements in ``buf``.
+* ``dtype`` - The datatype of elements in ``buf``.
+* ``peer`` - A source rank.
+* ``comm`` - A communicator for which the operation is performed.
+* ``stream`` - A stream associated with the operation.
+* ``attr`` - Optional attributes to customize the operation.
+* ``deps`` - An optional vector of events on which the operation should depend.
+
+**Returns**
+
+``ccl::event`` - An object to track the progress of the operation.
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   event CCL_API recv(void *buf,
+                      size_t count,
+                      datatype dtype,
+                      int peer,
+                      const communicator &comm,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class& deps = {});
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   template (), event>::type>
+   event CCL_API recv(BufferType *buf,
+                      size_t count,
+                      int peer,
+                      const communicator &comm,
+                      const stream &stream,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   event CCL_API recv(BufferType *buf,
+                      size_t count,
+                      int peer,
+                      const communicator &comm,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   event CCL_API recv(BufferObjectType &buf,
+                      size_t count,
+                      int peer,
+                      const communicator &comm,
+                      const stream &stream,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
+
+Below is an overloaded member function, provided for convenience. It differs from the above function only in what arguments it accepts.
+
+.. code:: cpp
+
+   event CCL_API recv(BufferObjectType &buf,
+                      size_t count,
+                      int peer,
+                      const communicator &comm,
+                      const pt2pt_attr &attr = default_pt2pt_attr,
+                      const vector_class &deps = {});
diff --git a/doc/rst/source/conf.py b/doc/rst/source/conf.py
index f04a1c83c..3999bbef9 100755
--- a/doc/rst/source/conf.py
+++ b/doc/rst/source/conf.py
@@ -18,7 +18,7 @@
 # -- Project information -----------------------------------------------------
 
 project = 'oneCCL'
-copyright = '2019–2022'
+copyright = '2023'
 author = 'Intel'
 
 # The full version, including alpha/beta/rc tags
@@ -113,7 +113,7 @@
 # Theme options
 html_theme_options = {
     'repository_url': 'https://github.com/oneapi-src/oneCCL',
-    'path_to_docs': 'doc/source',
+    'path_to_docs': 'doc/rst/source',
     'use_issues_button': True,
     'use_edit_page_button': True,
     'repository_branch': 'master',
diff --git a/doc/rst/source/env-variables.rst b/doc/rst/source/env-variables.rst
index 94aa58573..b1c9ab64a 100644
--- a/doc/rst/source/env-variables.rst
+++ b/doc/rst/source/env-variables.rst
@@ -2,7 +2,7 @@
 Environment Variables
 =====================
 
-Collective algorithms selection
+Collective Algorithms Selection
 ###############################
 
 oneCCL supports collective operations for the host (CPU) memory buffers and device (GPU) memory buffers. Below you can see how to select the collective algorithm depending on the type of buffer being utilized.
@@ -476,7 +476,7 @@ CCL_WORKER_AFFINITY
 
   The i-th local worker is pinned to the i-th core in the list. For example ``,-`` defines list of cores containing core with number ```` and range of cores with numbers from ```` to ````.
-  The number should not exceed the number of cores available on the system.
+  The core number should not exceed the number of cores available on the system.
   The length of the list should be equal to the number of workers.
 
 **Description**
@@ -602,7 +602,6 @@ This capability requires OFI as the transport (``CCL_ATL_TRANSPORT=ofi``).
 
 The OFI/SHM provider has support to utilize the `Intel(R) Data Streaming Accelerator* (DSA) `_. To run it with DSA*, you need:
-
 * Linux* OS kernel support for the DSA* shared work queues
 * Libfabric* 1.17 or later
@@ -1134,3 +1133,59 @@ CCL_ZE_LIBRARY_PATH
 
 **Description**
 
 Set this environment variable to specify the name and full path to ``Level-Zero`` library. The path should be absolute and validated. Set this variable if ``Level-Zero`` is not located in the default path. By default |product_short| uses ``libze_loader.so`` name for dynamic loading.
+
+
+Point-To-Point Operations
+*************************
+
+CCL_RECV
+#########
+
+**Syntax**
+
+::
+
+  CCL_RECV=
+
+**Arguments**
+
+.. list-table::
+   :widths: 25 50
+   :header-rows: 1
+   :align: left
+
+   * -
+     - Description
+   * - ``direct``
+     - Based on the MPI*/OFI* transport layer.
+ * - ``topo`` + - Uses XeLinks across GPUs in a multi-GPU node. Default for GPU buffers. + * - ``offload`` + - Based on the MPI*/OFI* transport layer and GPU RDMA when supported by the hardware. + + + +CCL_SEND +######### + +**Syntax** + +:: + + CCL_SEND= + +**Arguments** + +.. list-table:: + :widths: 25 50 + :header-rows: 1 + :align: left + + * - + - Description + * - ``direct`` + - Based on the MPI*/OFI* transport layer. + * - ``topo`` + - Uses XeLinks across GPUs in a multi-GPU node. Default for GPU buffers. + * - ``offload`` + - Based on the MPI*/OFI* transport layer and GPU RDMA when supported by the hardware. diff --git a/doc/rst/source/introduction/installation.rst b/doc/rst/source/introduction/installation.rst index 452e7fbdb..bd74cf699 100644 --- a/doc/rst/source/introduction/installation.rst +++ b/doc/rst/source/introduction/installation.rst @@ -91,24 +91,3 @@ You can customize CLI-based installation (for example, specify directory, compil :: make -j VERBOSE=1 install - -.. _prerequisites: - -Environment Setup -***************** - -Before you start using |product_short|, make sure to set up the library environment. -There are two ways to set up the environment: - -- Using standalone |product_short| package installed into ````: - - .. prompt:: bash - - source /env/setvars.sh - - -- Using |product_short| from |base_tk| installed into ```` (``/opt/intel/inteloneapi`` by default): - - .. prompt:: bash - - source /setvars.sh diff --git a/doc/rst/source/introduction/sample.rst b/doc/rst/source/introduction/sample.rst index 3ca73fadd..ca3396386 100644 --- a/doc/rst/source/introduction/sample.rst +++ b/doc/rst/source/introduction/sample.rst @@ -13,7 +13,7 @@ Build details #. :ref:`Build ` |product_short| with ``SYCL`` support (only Intel\ |reg|\ oneAPI DPC++/C++ Compiler is supported). -#. :ref:`Set up ` the library environment. +#. `Set up the library environment `_. #. Use the C++ driver with the -fsycl option to build the sample: @@ -34,3 +34,5 @@ To run the sample, use the following command: mpiexec ./sample where ```` represents optional mpiexec parameters such as node count, processes per node, hosts, and so on. + +.. note:: Explore the complete list of oneAPI code samples in the `oneAPI Samples Catalog `_. These samples were designed to help you develop, offload, and optimize multiarchitecture applications targeting CPUs, GPUs, and FPGAs. 
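Before the examples changes below, here is a minimal sketch (an editor's addition, not part of the patch) of how the ``send``/``recv`` operations documented above fit together in a two-rank program. Communicator, stream, and SYCL queue setup are assumed to follow the existing sycl examples, and error handling is elided:

    // Editor's sketch: device-buffer exchange with the oneCCL point-to-point API.
    // Assumes comm has exactly two ranks and stream wraps the SYCL queue q.
    #include "oneapi/ccl.hpp"
    #include <sycl/sycl.hpp>

    void exchange(ccl::communicator& comm, ccl::stream& stream, sycl::queue& q) {
        const size_t count = 1024;
        int* buf = sycl::malloc_device<int>(count, q);
        if (comm.rank() == 0) {
            // rank 0 sends to peer rank 1; wait() blocks until completion
            ccl::send(buf, count, ccl::datatype::int32, /*peer*/ 1, comm, stream).wait();
        }
        else {
            // rank 1 receives from peer rank 0
            ccl::recv(buf, count, ccl::datatype::int32, /*peer*/ 0, comm, stream).wait();
        }
        sycl::free(buf, q);
    }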
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 68162f389..c54bff196 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -23,10 +23,6 @@ message(STATUS "PROJECT_SOURCE_DIR: ${PROJECT_SOURCE_DIR}") if (NOT DEFINED ENV{COMPUTE_BACKEND}) message(STATUS "COMPUTE_BACKEND is not defined") - if (${CMAKE_CXX_COMPILER} MATCHES ".*icpx") - set(COMPUTE_BACKEND "dpcpp" CACHE STRING "compute backend value") - message(STATUS "COMPUTE_BACKEND: ${COMPUTE_BACKEND} (set by default)") - endif() else() message(STATUS "COMPUTE_BACKEND: ${COMPUTE_BACKEND} (set by user)") endif() @@ -48,7 +44,12 @@ if (DEFINED ENV{I_MPI_ROOT}) set(I_MPI_ROOT "$ENV{I_MPI_ROOT}") endif() +if (DEFINED ENV{ONEAPI_ROOT}) + set(ONEAPI_ROOT "$ENV{ONEAPI_ROOT}") +endif() + message(STATUS "CCL_ROOT: ${CCL_ROOT}") +message(STATUS "ONEAPI_ROOT: ${ONEAPI_ROOT}") message(STATUS "CCL_CONFIGURATION: ${CCL_CONFIGURATION}") if (NOT DEFINED ${CCL_INSTALL_BENCHMARKS}) @@ -81,6 +82,12 @@ set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${CXX_COMP set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) +#add path with sycl headers explicitly +if (DEFINED ONEAPI_ROOT) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -isystem ${ONEAPI_ROOT}/include") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isystem ${ONEAPI_ROOT}/include") +endif() + if ("${COMPUTE_BACKEND}" STREQUAL "dpcpp") set(CMAKE_CLANG_FLAGS "-fsycl") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -lsycl") @@ -95,7 +102,7 @@ endif() set(GCC_BF16_MIN_SUPPORTED "4.9.0") -if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${GCC_BF16_MIN_SUPPORTED})) +if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel" OR ${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM" OR ${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" AND NOT ${CMAKE_C_COMPILER_VERSION} VERSION_LESS ${GCC_BF16_MIN_SUPPORTED})) add_definitions(-DCCL_BF16_COMPILER) set(CCL_BF16_COMPILER ON) else() @@ -104,7 +111,8 @@ endif() message(STATUS "BF16 AVX512F compiler: ${CCL_BF16_COMPILER}") if (CCL_BF16_COMPILER) - if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) + if ((${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "IntelLLVM" + OR ${CMAKE_C_COMPILER_ID} STREQUAL "GNU")) add_definitions(-DCCL_BF16_TARGET_ATTRIBUTES) message(STATUS "BF16 target attributes: yes") else() @@ -118,6 +126,8 @@ add_subdirectory(cpu) if ("${COMPUTE_BACKEND}" STREQUAL "dpcpp") add_subdirectory(sycl) + #TODO: add cpu support + add_subdirectory(pt2pt) endif() add_subdirectory(common) add_subdirectory(benchmark) diff --git a/examples/benchmark/include/benchmark.hpp b/examples/benchmark/include/benchmark.hpp index 6beb52a42..4362a392e 100644 --- a/examples/benchmark/include/benchmark.hpp +++ b/examples/benchmark/include/benchmark.hpp @@ -105,15 +105,6 @@ void print_help_usage(const char* app) { ext_values_names[DEFAULT_EXT_VALUES].c_str()); } -template -std::string find_str_val(Container& mp, const Dtype& key) { - typename std::map::iterator it; - it = mp.find(key); - if (it != mp.end()) - return it->second; - return NULL; -} - template bool find_key_val(ccl::reduction& key, Container& mp, const Dtype& val) { for (auto& i : mp) { @@ -132,24 +123,6 @@ bool is_check_values_enabled(check_values_t check_values) { return ret; } -int check_supported_options(const std::string& option_name, - const 
std::string& option_value, - const std::set& supported_option_values) { - std::stringstream sstream; - - if (supported_option_values.find(option_value) == supported_option_values.end()) { - PRINT("unsupported %s: %s", option_name.c_str(), option_value.c_str()); - - std::copy(supported_option_values.begin(), - supported_option_values.end(), - std::ostream_iterator(sstream, " ")); - PRINT("supported values: %s", sstream.str().c_str()); - return -1; - } - - return 0; -} - int set_backend(const std::string& option_value, backend_type_t& backend) { std::string option_name = "backend"; std::set supported_option_values{ backend_names[BACKEND_HOST] }; @@ -551,16 +524,6 @@ void adjust_elem_counts(user_options_t& options) { } } -bool is_valid_integer_option(const char* option) { - std::string str(option); - bool only_digits = (str.find_first_not_of("0123456789") == std::string::npos); - return (only_digits && atoi(option) >= 0); -} - -bool is_valid_integer_option(int option) { - return (option >= 0); -} - void adjust_user_options(user_options_t& options) { adjust_elem_counts(options); } diff --git a/examples/benchmark/include/coll.hpp b/examples/benchmark/include/coll.hpp index 446c9751a..dbf9a966b 100644 --- a/examples/benchmark/include/coll.hpp +++ b/examples/benchmark/include/coll.hpp @@ -199,7 +199,7 @@ struct base_coll { max_error = expected_float * g; } } - if (fabs(max_error) < fabs(expected_float - value_float)) { + if (std::fabs(max_error) < std::fabs(expected_float - value_float)) { return 1; } return 0; diff --git a/examples/cpu/cpu_allreduce_bf16_test.cpp b/examples/cpu/cpu_allreduce_bf16_test.cpp index 07401604c..28f891408 100644 --- a/examples/cpu/cpu_allreduce_bf16_test.cpp +++ b/examples/cpu/cpu_allreduce_bf16_test.cpp @@ -35,7 +35,7 @@ for (size_t i = 0; i < COUNT; i++) { \ double expected = ((comm_size * (comm_size - 1) / 2) + ((float)(i)*comm_size)); \ double max_error = g * expected; \ - if (fabs(max_error) < fabs(expected - recv_buf[i])) { \ + if (std::fabs(max_error) < std::fabs(expected - recv_buf[i])) { \ printf( \ "[%d] got recv_buf[%zu] = %0.7f, but expected = %0.7f, max_error = %0.16f\n", \ comm.rank(), \ diff --git a/examples/external_launcher/run.sh b/examples/external_launcher/run.sh index fc32213fc..8d1d2d328 100755 --- a/examples/external_launcher/run.sh +++ b/examples/external_launcher/run.sh @@ -211,14 +211,17 @@ run_binary() log_files=("${log_files[@]}" "${log_file}") cmd="$dir/run_binary.sh -s ${SIZE} -r ${rank} -ls ${local_size} -lr ${i}" - cmd="${cmd} -cv ${VARS} -lf ${log_file} -km ${kvs_mode} -kp ${kvs_param}" + cmd="${cmd} -cclv ${VARS} -lf ${log_file} -km ${kvs_mode} -kp ${kvs_param}" if [[ -z ${I_MPI_ROOT} ]] then cmd="${cmd} -mv ${IMPI_PATH}/env/vars.sh" else cmd="${cmd} -mv ${I_MPI_ROOT}/env/vars.sh" fi - + if [[ ! 
-z "${SYCL_BUNDLE_ROOT}" && "${ENABLE_CODECOV}" = "yes" ]]; + then + cmd="${cmd} -cv ${SYCL_BUNDLE_ROOT}/env/vars.sh" + fi timeout_prefix="timeout -k $((cmd_timeout))s $((cmd_timeout))s" run_cmd ${host} "${cmd}" "${timeout_prefix}" done diff --git a/examples/external_launcher/run_binary.sh b/examples/external_launcher/run_binary.sh index eb84b552d..95ae400ea 100755 --- a/examples/external_launcher/run_binary.sh +++ b/examples/external_launcher/run_binary.sh @@ -28,29 +28,30 @@ print_help() echo_log " ./${BASENAME}.sh [options]" echo_log "" echo_log ":" - echo_log " -s Total number of ranks" - echo_log " -r Rank" - echo_log " -ls Local number of ranks" - echo_log " -lr Local rank" - echo_log " -cv Path to oneCCL variables script" - echo_log " -mv Path to IMPI variables script" - echo_log " -lf Log file" - echo_log " -km Create KVS mode" - echo_log " -kp Create KVS param" + echo_log " -s Total number of ranks" + echo_log " -r Rank" + echo_log " -ls Local number of ranks" + echo_log " -lr Local rank" + echo_log " -cclv Path to oneCCL variables script" + echo_log " -mv Path to IMPI variables script" + echo_log " -cv Path to Compiler variables script (not mandatory)" + echo_log " -lf Log file" + echo_log " -km Create KVS mode" + echo_log " -kp Create KVS param" echo_log "" echo_log "Example:" - echo_log " ./${BASENAME}.sh -s 4 -r 0 -ls 2 -lr 0 -cv -mv -lf -km -kp " + echo_log " ./${BASENAME}.sh -s 4 -r 0 -ls 2 -lr 0 -cclv -mv -cv -lf -km -kp " echo_log "" } parse_arguments() { - if [ $# -ne 18 ]; + #NOTE: by condition below we can check case when w/ and w/o -cv option + if [ $# -ne 18 || $# -ne 20 ] then print_help exit 1 fi - read_count=0 while [ $# -ne 0 ] @@ -72,7 +73,7 @@ parse_arguments() LOCAL_RANK=$2 read_count=$((read_count+1)) ;; - "-cv"|"--ccl_vars") + "-cclv"|"--ccl_vars") CCL_VARS=$2 read_count=$((read_count+1)) ;; @@ -80,6 +81,10 @@ parse_arguments() MPI_VARS=$2 read_count=$((read_count+1)) ;; + "-cv"|"--compiler_vars") + COMPILER_VARS=$2 + read_count=$((read_count+1)) + ;; "-lf"|"--log_file") LOG_FILE=$2 read_count=$((read_count+1)) @@ -102,8 +107,12 @@ parse_arguments() shift shift done - - expected_read_count=9 + if [ -z ${COMPILER_VARS} ] + then + expected_read_count=9 + else + expected_read_count=10 + fi if [ "${read_count}" -ne "${expected_read_count}" ]; then echo_log "ERROR: unexpected number of read options ($read_count), expected ${expected_read_count}" @@ -114,15 +123,16 @@ parse_arguments() echo_log "-----------------------------------------------------------" echo_log "PARAMETERS" echo_log "-----------------------------------------------------------" - echo_log "SIZE = ${SIZE}" - echo_log "RANK = ${RANK}" - echo_log "LOCAL_SIZE = ${LOCAL_SIZE}" - echo_log "LOCAL_RANK = ${LOCAL_RANK}" - echo_log "CCL_VARS = ${CCL_VARS}" - echo_log "MPI_VARS = ${MPI_VARS}" - echo_log "LOG_FILE = ${LOG_FILE}" - echo_log "KVS_MODE = ${KVS_MODE}" - echo_log "KVS_PARAM = ${KVS_PARAM}" + echo_log "SIZE = ${SIZE}" + echo_log "RANK = ${RANK}" + echo_log "LOCAL_SIZE = ${LOCAL_SIZE}" + echo_log "LOCAL_RANK = ${LOCAL_RANK}" + echo_log "CCL_VARS = ${CCL_VARS}" + echo_log "MPI_VARS = ${MPI_VARS}" + echo_log "COMPILER_VARS = ${COMPILER_VARS}" + echo_log "LOG_FILE = ${LOG_FILE}" + echo_log "KVS_MODE = ${KVS_MODE}" + echo_log "KVS_PARAM = ${KVS_PARAM}" echo_log "-----------------------------------------------------------" } @@ -142,6 +152,11 @@ function run() fi echo $LOG_FILE + if [[ ! 
-z "${COMPILER_VARS}" ]]; + then + echo "Compiler variables script" + source ${COMPILER_VARS} + fi if [[ $CCL_VARS == *"setvars.sh"* ]]; then echo "Use standalone CCL variables script" diff --git a/examples/include/base.hpp b/examples/include/base.hpp index 436bcaa4e..4371902d3 100644 --- a/examples/include/base.hpp +++ b/examples/include/base.hpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -119,3 +120,40 @@ inline void mpi_finalize() { if (!is_finalized) MPI_Finalize(); } + +inline bool is_valid_integer_option(const char* option) { + std::string str(option); + bool only_digits = (str.find_first_not_of("0123456789") == std::string::npos); + return (only_digits && atoi(option) >= 0); +} + +inline bool is_valid_integer_option(int option) { + return (option >= 0); +} + +inline int check_supported_options(const std::string& option_name, + const std::string& option_value, + const std::set& supported_option_values) { + std::stringstream sstream; + + if (supported_option_values.find(option_value) == supported_option_values.end()) { + PRINT("unsupported %s: %s", option_name.c_str(), option_value.c_str()); + + std::copy(supported_option_values.begin(), + supported_option_values.end(), + std::ostream_iterator(sstream, " ")); + PRINT("supported values: %s", sstream.str().c_str()); + return -1; + } + + return 0; +} + +template +std::string find_str_val(Container& mp, const Dtype& key) { + typename std::map::iterator it; + it = mp.find(key); + if (it != mp.end()) + return it->second; + return NULL; +} diff --git a/examples/pt2pt/CMakeLists.txt b/examples/pt2pt/CMakeLists.txt new file mode 100644 index 000000000..93e9d40f3 --- /dev/null +++ b/examples/pt2pt/CMakeLists.txt @@ -0,0 +1,44 @@ +# +# Copyright 2016-2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +file(GLOB sources "./src/*.c" "./src/*.cpp") + +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) +endif() + +include_directories(include) +include_directories(src) + +list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") + +link_directories(${EXAMPLES_LIB_DIRS}) + +foreach(src ${sources}) + get_filename_component(executable ${src} NAME_WE) + add_executable(${executable} ${src}) + + target_include_directories(${executable} PRIVATE ${EXAMPLES_INC_DIRS}) + target_link_libraries(${executable} PRIVATE ccl) + target_link_libraries(${executable} PUBLIC pthread) + target_link_libraries(${executable} PUBLIC rt) + target_link_libraries(${executable} PUBLIC m) + target_link_libraries(${executable} PUBLIC dl) + target_link_libraries(${executable} PUBLIC -L${I_MPI_ROOT}/lib/release/) + target_link_libraries(${executable} PUBLIC mpi) + target_link_libraries(${executable} PUBLIC ${COMPUTE_BACKEND_TARGET_NAME}) + install(TARGETS ${executable} RUNTIME DESTINATION ${CCL_INSTALL_EXAMPLES}/pt2pt OPTIONAL) +endforeach() diff --git a/examples/pt2pt/include/pt2pt_base.hpp b/examples/pt2pt/include/pt2pt_base.hpp new file mode 100644 index 000000000..cf1ff11f9 --- /dev/null +++ b/examples/pt2pt/include/pt2pt_base.hpp @@ -0,0 +1,296 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include + +#include +#include +#include + +#include "types.hpp" + +typedef struct user_options_t { + uint32_t cache; + uint32_t iters; + + std::vector peers; + uint32_t queue; + int min_elem_count; + int max_elem_count; + validate_values_t validate; + uint32_t warmup_iters; + uint32_t wait; + int window_size; + + user_options_t() { + iters = DEFAULT_ITERS; + warmup_iters = DEFAULT_WARMUP_ITERS; + cache = DEFAULT_CACHE_OPS; + queue = DEFAULT_QUEUE; + wait = DEFAULT_WAIT; + min_elem_count = DEFAULT_MIN_ELEM_COUNT; + max_elem_count = DEFAULT_MAX_ELEM_COUNT; + validate = DEFAULT_VALIDATE; + // for bw benchmark + window_size = DEFAULT_WINDOW_SIZE; + + peers.reserve(2); + // filling out with the default values + peers.push_back(0); + peers.push_back(1); + } +} user_options_t; + +int parse_user_options(int& argc, char**(&argv), user_options_t& options) { + int ch; + int errors = 0; + + char short_options[1024] = { 0 }; + const char* base_options = "i:w:c:q:s:f:t:v:m:h"; + memcpy(short_options, base_options, strlen(base_options)); + + struct option getopt_options[] = { + { "iters", required_argument, nullptr, 'i' }, + { "warmup_iters", required_argument, nullptr, 'w' }, + { "cache", required_argument, nullptr, 'c' }, + { "queue", required_argument, nullptr, 'q' }, + { "wait", required_argument, nullptr, 's' }, + { "min_elem_count", required_argument, nullptr, 'f' }, + { "max_elem_count", required_argument, nullptr, 't' }, + { "validate", required_argument, nullptr, 'v' }, + { "window", required_argument, nullptr, 'm' }, + { "help", no_argument, nullptr, 'h' }, + //TODO: { "peers", required_argument, nullptr, 'p' }, + { nullptr, 0, nullptr, 0 } // required at end of array. + }; + + while ((ch = getopt_long(argc, argv, short_options, getopt_options, nullptr)) != -1) { + switch (ch) { + case 'i': + if (is_valid_integer_option(optarg)) { + options.iters = atoll(optarg); + } + else + errors++; + break; + case 'w': + if (is_valid_integer_option(optarg)) { + options.warmup_iters = atoll(optarg); + } + else + errors++; + break; + case 'c': + if (is_valid_integer_option(optarg)) { + options.cache = atoll(optarg); + } + else + errors++; + break; + case 'q': + if (is_valid_integer_option(optarg)) { + options.queue = atoll(optarg); + } + else + errors++; + break; + case 'f': + if (is_valid_integer_option(optarg)) { + options.min_elem_count = atoll(optarg); + } + else + errors++; + break; + case 't': + if (is_valid_integer_option(optarg)) { + options.max_elem_count = atoll(optarg); + } + else + errors++; + break; + case 's': + if (is_valid_integer_option(optarg)) { + options.wait = atoll(optarg); + } + else + errors++; + break; + case 'v': + if (set_validate_values(optarg, options.validate)) { + PRINT("failed to parse 'check' option"); + errors++; + } + break; + case 'm': + if (is_valid_integer_option(optarg)) { + options.window_size = atoll(optarg); + } + else + errors++; + break; + case 'h': return INVALID_RETURN; + default: + PRINT("failed to parse unknown option"); + errors++; + break; + } + } + + if (optind < argc) { + PRINT("non-option ARGV-elements given"); + errors++; + } + + if (errors > 0) { + PRINT("found %d errors while parsing user options", errors); + for (int idx = 0; idx < argc; idx++) { + PRINT("arg %d: %s", idx, argv[idx]); + } + return -1; + } + return 0; +} + +auto create_attr(const bool is_cache, const int count, const std::string& match_id_suffix) { + auto attr = ccl::create_operation_attr(); + if (is_cache) { + std::string matchId = "_len_" + std::to_string(count) + 
match_id_suffix; + attr.set(ccl::string_class(matchId)); + attr.set(true); + } + return attr; +} + +void print_timings(ccl::communicator& comm, + const user_options_t& options, + const double total_time, + const int count, + const std::string mesure_str) { + static bool print_once = false; + + if (!print_once && comm.rank() == 0) { + std::stringstream ss; + ss << std::right << std::setw(COL_WIDTH - 4) << "#bytes" << std::setw(COL_WIDTH) + << "#repetitions" << std::setw(COL_WIDTH) << mesure_str << std::endl; + std::cout << ss.str(); + print_once = true; + } + + std::stringstream ss; + + ss << std::right << std::fixed << std::setw(COL_WIDTH - 4) << count << std::setw(COL_WIDTH) + << options.iters << std::setw(COL_WIDTH) << std::setprecision(COL_PRECISION) << total_time + << std::setw(COL_WIDTH) << std::endl; + std::cout << ss.str(); +} + +template +void check_buffers(sycl::queue q, + const user_options_t& options, + const int count, + const size_t iter_idx, + Dtype buf_recv) { + bool failed = false; + sycl::buffer check_buf(count); + + auto e = q.submit([&](auto& h) { + sycl::accessor check_buf_acc(check_buf, h, sycl::write_only); + h.parallel_for(count, [=](auto id) { + if (buf_recv[id] != static_cast(id + iter_idx)) { + check_buf_acc[id] = INVALID_VALUE; + } + }); + }); + + if (options.wait) { + e.wait_and_throw(); + } + + { + sycl::host_accessor check_buf_acc(check_buf, sycl::read_only); + for (int j = 0; j < count; j++) { + if (check_buf_acc[j] == INVALID_VALUE) { + failed = true; + break; + } + } + } + + if (failed) { + std::cout << "FAILED: iter_idx: " << iter_idx << ", count: " << count << std::endl; + ASSERT(0, "unexpected value"); + } +} + +void print_help_usage(const char* app) { + PRINT("\nUSAGE:\n" + "\t%s [OPTIONS]\n\n" + "OPTIONS:\n" + "\t[-i,--iters ]: %d\n" + "\t[-w,--warmup_iters ]: %d\n" + "\t[-c,--cache ]: %d\n" + "\t[-q,--queue ]: %d\n" + "\t[-s,--wait ]: %d\n" + "\t[-f,--min_elem_count ]: %d\n" + "\t[-t,--max_elem_count ]: %d\n" + "\t[-v,--validate ]: %s\n" + "\t[-h,--help]\n\n" + "example:\n\t--queue 1 --cache 0 --validate 1\n", + app, + DEFAULT_ITERS, + DEFAULT_WARMUP_ITERS, + DEFAULT_CACHE_OPS, + DEFAULT_QUEUE, + DEFAULT_WAIT, + DEFAULT_MIN_ELEM_COUNT, + DEFAULT_MAX_ELEM_COUNT, + validate_values_names[DEFAULT_VALIDATE].c_str()); +} + +void print_user_options(const std::string benchmark, + const user_options_t& options, + const ccl::communicator& comm) { + std::stringstream ss; + + std::string validate_values_str = find_str_val(validate_values_names, options.validate); + + ss << "\noptions:" + << "\n iters: " << options.iters << "\n warmup_iters: " << options.warmup_iters + << "\n cache: " << options.cache << "\n queue: " << options.queue + << "\n wait: " << options.wait << "\n min_elem_count: " << options.min_elem_count + << "\n max_elem_count: " << options.max_elem_count + << "\n validate: " << validate_values_str; + + if (benchmark == "Bandwidth") { + ss << "\n window_size: " << options.window_size; + } + + if (comm.rank() == 0) { + std::cout << ss.str() << std::endl; + } + + ss.str(""); + + ss << "#------------------------------------------------------------\n" + << "# Benchmarking: " << benchmark << "\n" + << "# #processes: " << comm.size() << "\n" + << "#------------------------------------------------------------\n"; + + if (comm.rank() == 0) { + std::cout << ss.str() << std::endl; + } +} diff --git a/examples/pt2pt/include/pt2pt_transport.hpp b/examples/pt2pt/include/pt2pt_transport.hpp new file mode 100644 index 000000000..e5e6c367e --- /dev/null +++ 
b/examples/pt2pt/include/pt2pt_transport.hpp @@ -0,0 +1,157 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once + +#include +#include + +#include "oneapi/ccl.hpp" +#include "sycl_base.hpp" +#include "pt2pt_base.hpp" + +class transport_data { +public: + static transport_data& instance(); + static size_t get_comm_size(); + + int get_rank() const noexcept; + int get_size() const noexcept; + + ccl::shared_ptr_class get_kvs(); + + void init_comms(user_options_t& options); + std::vector& get_comms(); + void reset_comms(); + + std::vector& get_streams(); + + void create_sycl_queue(user_options_t& options); + sycl::queue get_sycl_queue(); + +private: + transport_data(); + ~transport_data(); + + int rank; + int size; + + std::vector local_ranks; + + ccl::shared_ptr_class kvs; + std::vector comms; + + std::vector streams; + sycl::queue queue; + + void init_by_mpi(); + void deinit_by_mpi(); +}; + +transport_data::transport_data() { + init_by_mpi(); +} + +transport_data::~transport_data() { + deinit_by_mpi(); +} + +transport_data& transport_data::instance() { + static transport_data inst; + return inst; +} + +size_t transport_data::get_comm_size() { + return transport_data::instance().get_comms()[0].size(); +} + +int transport_data::get_rank() const noexcept { + return rank; +} + +int transport_data::get_size() const noexcept { + return size; +} + +ccl::shared_ptr_class transport_data::get_kvs() { + return kvs; +} + +void transport_data::init_by_mpi() { + ccl::init(); + + MPI_Init(NULL, NULL); + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + ccl::shared_ptr_class kvs_candidate; + ccl::kvs::address_type main_addr; + if (rank == 0) { + kvs_candidate = ccl::create_main_kvs(); + main_addr = kvs_candidate->get_address(); + MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD); + } + else { + MPI_Bcast((void*)main_addr.data(), main_addr.size(), MPI_BYTE, 0, MPI_COMM_WORLD); + kvs_candidate = ccl::create_kvs(main_addr); + } + kvs = kvs_candidate; +} + +void transport_data::deinit_by_mpi() { + MPI_Finalize(); +} + +std::vector& transport_data::get_streams() { + return streams; +} + +void transport_data::init_comms(user_options_t& options) { + create_sycl_queue(options); + + auto q = get_sycl_queue(); + + // create communicator + auto dev = ccl::create_device(q.get_device()); + auto ctx = ccl::create_context(q.get_context()); + comms.push_back(ccl::create_communicator(size, rank, dev, ctx, kvs)); + + // create stream + streams.push_back(ccl::create_stream(q)); +} + +std::vector& transport_data::get_comms() { + return comms; +} + +void transport_data::create_sycl_queue(user_options_t& options) { + sycl::property_list props{}; + if (options.queue) { + props = { sycl::property::queue::in_order{} }; + } + + if (!::create_sycl_queue("gpu", rank, queue, props)) { + exit(INVALID_RETURN); + } +} + +sycl::queue transport_data::get_sycl_queue() { + return queue; +} + +void transport_data::reset_comms() { + 
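+    // note: communicators and streams must be released here, before the
+    // singleton's destructor runs deinit_by_mpi()/MPI_Finalize() at exit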
comms.clear(); + streams.clear(); +} diff --git a/examples/pt2pt/include/types.hpp b/examples/pt2pt/include/types.hpp new file mode 100644 index 000000000..6e78a6648 --- /dev/null +++ b/examples/pt2pt/include/types.hpp @@ -0,0 +1,68 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once + +// enums +typedef enum { VALIDATE_OFF, VALIDATE_LAST_ITER, VALIDATE_ALL_ITERS } validate_values_t; + +std::map validate_values_names = { + std::make_pair(VALIDATE_OFF, "off"), + std::make_pair(VALIDATE_LAST_ITER, "last"), + std::make_pair(VALIDATE_ALL_ITERS, "all") +}; + +// defines +#define COL_WIDTH (18) +#define COL_PRECISION (2) + +#define DEFAULT_ITERS (16) +#define DEFAULT_WARMUP_ITERS (16) + +#define DEFAULT_CACHE_OPS (1) +#define DEFAULT_QUEUE (0) +#define DEFAULT_WAIT (1) +#define DEFAULT_WINDOW_SIZE (64) + +#define DEFAULT_MIN_ELEM_COUNT (1) +#define DEFAULT_MAX_ELEM_COUNT (33554432) // till 128 MBytes +#define DEFAULT_VALIDATE (VALIDATE_LAST_ITER) + +#define INVALID_VALUE (-1) +#define INVALID_RETURN (-1) + +int set_validate_values(const std::string& option_value, validate_values_t& validate) { + std::string option_name = "validate"; + + std::set supported_option_values{ validate_values_names[VALIDATE_OFF], + validate_values_names[VALIDATE_LAST_ITER], + validate_values_names[VALIDATE_ALL_ITERS] }; + + if (check_supported_options(option_name, option_value, supported_option_values)) { + return INVALID_RETURN; + } + + if (option_value == validate_values_names[VALIDATE_OFF]) { + validate = VALIDATE_OFF; + } + else if (option_value == validate_values_names[VALIDATE_LAST_ITER]) { + validate = VALIDATE_LAST_ITER; + } + else if (option_value == validate_values_names[VALIDATE_ALL_ITERS]) { + validate = VALIDATE_ALL_ITERS; + } + + return 0; +} diff --git a/examples/pt2pt/src/ccl_bw.cpp b/examples/pt2pt/src/ccl_bw.cpp new file mode 100644 index 000000000..b74a064d3 --- /dev/null +++ b/examples/pt2pt/src/ccl_bw.cpp @@ -0,0 +1,168 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "base.hpp" +#include "sycl_base.hpp" +#include "pt2pt_transport.hpp" + +#include "oneapi/ccl.hpp" + +int main(int argc, char* argv[]) { + user_options_t options; + + if (parse_user_options(argc, argv, options)) { + print_help_usage(argv[0]); + exit(INVALID_RETURN); + } + + auto& transport = transport_data::instance(); + transport.init_comms(options); + + auto q = transport.get_sycl_queue(); + auto rank = transport.get_rank(); + auto& comms = transport.get_comms(); + auto streams = transport.get_streams(); + + print_user_options("Bandwidth", options, comms[0]); + + double start_t = 0.0, end_t = 0.0, diff_t = 0.0; + size_t dtype_size = sizeof(ccl::datatype::int32); + + for (int count = options.min_elem_count; count <= options.max_elem_count; + count = (count ? count * 2 : 1)) { + auto buf_send = sycl::malloc_device(count, q); + auto buf_recv = sycl::malloc_device(count, q); + + // oneCCL spec defines attr identifiers that may be used to fill operation + // attribute objects. It means for every pair of op, we have to keep own unique attr + // because we may have conflicts between 2 different pairs with one common attr. + auto attr = create_attr(options.cache, count, to_string(0)); + auto attr1 = create_attr(options.cache, count, to_string(1)); + + if (rank == options.peers[0]) { + for (size_t iter_idx = 0; iter_idx < (options.warmup_iters + options.iters); + iter_idx++) { + // init the buffer + auto e = q.submit([&](auto& h) { + h.parallel_for(count, [=](auto id) { + buf_send[id] = id + iter_idx; + buf_recv[id] = INVALID_VALUE; + }); + }); + + if (options.wait) { + e.wait_and_throw(); + } + + if (iter_idx == options.warmup_iters) { + ccl::barrier(comms[0]); + start_t = MPI_Wtime(); + } + + for (int j = 0; j < options.window_size; j++) { + auto send_event = ccl::send(buf_send, + count, + ccl::datatype::int32, + options.peers[1], + comms[0], + streams[0], + attr); + if (options.wait) { + send_event.wait(); + } + } + + auto recv_event = ccl::recv(buf_recv, + 1, + ccl::datatype::int32, + options.peers[1], + comms[0], + streams[0], + attr1); + if (options.wait) { + recv_event.wait(); + } + + end_t = MPI_Wtime(); + diff_t = end_t - start_t; + } + } + else if (rank == options.peers[1]) { + for (size_t iter_idx = 0; iter_idx < (options.warmup_iters + options.iters); + iter_idx++) { + // init the buffer + auto e = q.submit([&](auto& h) { + h.parallel_for(count, [=](auto id) { + buf_send[id] = id + iter_idx; + buf_recv[id] = INVALID_VALUE; + }); + }); + if (options.wait) { + e.wait_and_throw(); + } + + if (iter_idx == options.warmup_iters) { + ccl::barrier(comms[0]); + } + + for (int j = 0; j < options.window_size; j++) { + auto recv_event = ccl::recv(buf_recv, + count, + ccl::datatype::int32, + options.peers[0], + comms[0], + streams[0], + attr); + if (options.wait) { + recv_event.wait(); + } + } + + // we can send 1 count here, this pair is for aligning + // no need a big count + auto send_event = ccl::send(buf_send, + 1, + ccl::datatype::int32, + options.peers[0], + comms[0], + streams[0], + attr1); + if (options.wait) { + send_event.wait(); + } + if (options.validate == VALIDATE_ALL_ITERS || + (options.validate == VALIDATE_LAST_ITER && + iter_idx == (options.warmup_iters + options.iters) - 1)) { + check_buffers(q, options, count, iter_idx, buf_recv); + } + } + } + + if (rank == options.peers[0]) { + double bandwidth_t = + (count * dtype_size / 1e6 * options.iters * options.window_size) / diff_t; + print_timings(comms[0], options, bandwidth_t, count * dtype_size, "Mbytes/sec"); + } + + 
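+        // note: the bandwidth above covers only the timed iterations: each
+        // message carries count * dtype_size bytes, repeated iters * window_size
+        // times, over diff_t seconds; the 1e6 factor scales the result to MBytes/sec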
sycl::free(buf_send, q); + sycl::free(buf_recv, q); + } + + PRINT_BY_ROOT(comms[0], "\n# All done\n"); + + transport.reset_comms(); + + return 0; +} diff --git a/examples/pt2pt/src/ccl_latency.cpp b/examples/pt2pt/src/ccl_latency.cpp new file mode 100644 index 000000000..47fc756b6 --- /dev/null +++ b/examples/pt2pt/src/ccl_latency.cpp @@ -0,0 +1,156 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "base.hpp" +#include "sycl_base.hpp" +#include "pt2pt_transport.hpp" + +#include "oneapi/ccl.hpp" + +int main(int argc, char* argv[]) { + user_options_t options; + + if (parse_user_options(argc, argv, options)) { + print_help_usage(argv[0]); + exit(INVALID_RETURN); + } + + auto& transport = transport_data::instance(); + transport.init_comms(options); + + auto q = transport.get_sycl_queue(); + auto rank = transport.get_rank(); + auto& comms = transport.get_comms(); + auto streams = transport.get_streams(); + + print_user_options("Latency", options, comms[0]); + + size_t dtype_size = sizeof(ccl::datatype::int32); + + for (int count = options.min_elem_count; count <= options.max_elem_count; + count = (count ? count * 2 : 1)) { + double start_t = 0.0, end_t = 0.0, diff_t = 0.0, total_latency_t = 0.0; + + // create buffers + auto buf_send = sycl::malloc_device(count, q); + auto buf_recv = sycl::malloc_device(count, q); + + // oneCCL spec defines attr identifiers that may be used to fill operation + // attribute objects. It means for every pair of op, we have to keep own unique attr + // because we may have conflicts between 2 different pairs with one common attr. 
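+    // e.g. with count=1024 the first pair caches under match_id "_len_10240"
+    // and the second under "_len_10241" (count and suffix are concatenated)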
+ auto attr = create_attr(options.cache, count, to_string(0)); + auto attr1 = create_attr(options.cache, count, to_string(1)); + + for (size_t iter_idx = 0; iter_idx < (options.warmup_iters + options.iters); iter_idx++) { + // init the buffer + auto e = q.submit([&](auto& h) { + h.parallel_for(count, [=](auto id) { + buf_send[id] = id + iter_idx; + buf_recv[id] = INVALID_VALUE; + }); + }); + + if (options.wait) { + e.wait_and_throw(); + } + + if (iter_idx == options.warmup_iters - 1) { + // to ensure that all processes or threads have reached + // a certain synchronization point before proceeding time + // calculation + ccl::barrier(comms[0]); + } + + if (rank == options.peers[0]) { + if (iter_idx >= options.warmup_iters) { + start_t = MPI_Wtime(); + } + + auto send_event = ccl::send(buf_send, + count, + ccl::datatype::int32, + options.peers[1], + comms[0], + streams[0], + attr); + if (options.wait) { + send_event.wait(); + } + + auto recv_event = ccl::recv(buf_recv, + count, + ccl::datatype::int32, + options.peers[1], + comms[0], + streams[0], + attr1); + if (options.wait) { + recv_event.wait(); + } + + if (iter_idx >= options.warmup_iters) { + end_t = MPI_Wtime(); + diff_t = end_t - start_t; + total_latency_t += diff_t; + } + } + else if (rank == options.peers[1]) { + auto recv_event = ccl::recv(buf_recv, + count, + ccl::datatype::int32, + options.peers[0], + comms[0], + streams[0], + attr); + if (options.wait) { + recv_event.wait(); + } + + auto send_event = ccl::send(buf_send, + count, + ccl::datatype::int32, + options.peers[0], + comms[0], + streams[0], + attr1); + if (options.wait) { + send_event.wait(); + } + } + + if (options.validate == VALIDATE_ALL_ITERS || + (options.validate == VALIDATE_LAST_ITER && + iter_idx == (options.warmup_iters + options.iters) - 1)) { + ccl::barrier(comms[0]); + check_buffers(q, options, count, iter_idx, buf_recv); + } + } + + if (rank == options.peers[0]) { + // test measures the round trip latency, divide by two to get the one-way latency + double average_t = (total_latency_t * 1e6) / (2.0 * options.iters); + print_timings(comms[0], options, average_t, count * dtype_size, "#usec(latency)"); + } + + sycl::free(buf_send, q); + sycl::free(buf_recv, q); + } + + PRINT_BY_ROOT(comms[0], "\n# All done\n"); + + transport.reset_comms(); + + return 0; +} diff --git a/include/oneapi/ccl/lp_types.hpp b/include/oneapi/ccl/lp_types.hpp index 2d05ebbf0..993bc99fa 100644 --- a/include/oneapi/ccl/lp_types.hpp +++ b/include/oneapi/ccl/lp_types.hpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace ccl { diff --git a/man/OneCCL.md b/man/OneCCL.md index 8ba24ec61..77163c84e 100644 --- a/man/OneCCL.md +++ b/man/OneCCL.md @@ -256,6 +256,44 @@ Note: REDUCE_SCATTER algorithm does not support yet the CCL_REDUCE_SCATTER_SCALE By-default: "direct" +## CCL_RECV + + +Set recv algorithm. + + + +RECV algorithms + - direct Using prepost(d2h-h2d) copies to get host buffers to invoke mpi/ofi->recv() + + - topo Topo scale-up algorithm (available if sycl and l0 are enabled) + + - offload Using device buffers directly into mpi/ofi layer skipping prepost copies d2h h2d. By-default used for scale-out. Setting extra MPI env vars for getting better performance (available if sycl and l0 are enabled) + + + +By-default: "topo" if sycl and l0 are enabled, otherwise offload for ofi/mpi transport + + +## CCL_SEND + + +Set send algorithm. 
+ + + +SEND algorithms + - direct Using prepost(d2h-h2d) copies to get host buffers to invoke mpi/ofi->send() + + - topo Topo scale-up algorithm (available if sycl and l0 are enabled) + + - offload Using device buffers directly into mpi/ofi layer skipping prepost copies d2h h2d. By-default used for scale-out. Setting extra MPI env vars for getting better performance (available if sycl and l0 are enabled) + + + +By-default: "topo" if sycl and l0 are enabled, otherwise offload for ofi/mpi transport + + ## CCL_ALLGATHERV_SCALEOUT @@ -389,6 +427,27 @@ Set to specify minimum number of bytes in chunk for reduce_scatter phase in ring By-default: "65536" +## CCL_REDUCE_SCATTER_TOPO_READ + + +Set this environment variable to select read or write based device-to-device data copy during the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives using device (GPU) buffers. + + + +Syntax CCL_REDUCE_SCATTER_TOPO_READ="<value>" +Arguments +"<value>" Description + - 1 Uses read based copy to transfer data across GPUs for the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives (default). + + - 0 Uses write based copy to transfer data across GPUs for the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives. + + + +Description +Set this environment variable to select read or write based device-to-device data copy during the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives using device (GPU) buffers. +By-default: "1" + + ## CCL_REDUCE_SCATTER_MONOLITHIC_KERNEL @@ -453,6 +512,94 @@ Set this environment variable to enable compute kernels for Alltoall and Alltoal By-default: "1" +## CCL_ALLGATHERV_PIPE_CHUNK_COUNT + + +Set this environment variable to enable pipelining implementation for Allgatherv collectives using device (GPU) buffers. + + + +Syntax +CCL_ALLGATHERV_PIPE_CHUNK_COUNT="<value>" Arguments +"<value>" Description + - 0: (default) Bypasses the chunking/pipelining code and directly calls the topology-aware code + + - 1: Calls the pipelining code with a single chunk. Effectively, it has identical behavior and performance as with "0", but exercises the chunking code path with a single chunk. + + + +2 or higher: Divides the message into as many logical parts, or chunks, as specified. Then, it executes the collective with each logical chunk. This should allow for several phases of the algorithm to run in parallel, as long as they don't use the same physical resource. Effectively, this should increase performance. +Description +Set this environment variable to enable control how many chunks are used for Allgatherv, pipeline-based collectives using device (GPU) buffers. +By-default: "0" + + +## CCL_ALLREDUCE_PIPE_CHUNK_COUNT + + +Set this environment variable to enable pipelining implementation for Allreduce collectives using device (GPU) buffers. + + + +Syntax +CCL_ALLREDUCE_PIPE_CHUNK_COUNT="<value>" Arguments +"<value>" Description + - 0: (default) Bypasses the chunking/pipelining code and directly calls the topology-aware code + + - 1: Calls the pipelining code with a single chunk. Effectively, it has identical behavior and performance as with "0", but exercises the chunking code path with a single chunk. + + + +2 or higher: Divides the message into as many logical parts, or chunks, as specified. Then, it executes the collective with each logical chunk. This should allow for several phases of the algorithm to run in parallel, as long as they don't use the same physical resource. 
Effectively, this should increase performance. +Description +Set this environment variable to enable control how many chunks are used for Allreduce pipeline-based collectives using device (GPU) buffers. +By-default: "0" + + +## CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT + + +Set this environment variable to enable pipelining implementation for Reduce_Scatter collectives using device (GPU) buffers. + + + +Syntax +CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT="<value>" Arguments +"<value>" Description + - 0: (default) Bypasses the chunking/pipelining code and directly calls the topology-aware code + + - 1: Calls the pipelining code with a single chunk. Effectively, it has identical behavior and performance as with "0", but exercises the chunking code path with a single chunk. + + + +2 or higher: Divides the message into as many logical parts, or chunks, as specified. Then, it executes the collective with each logical chunk. This should allow for several phases of the algorithm to run in parallel, as long as they don't use the same physical resource. Effectively, this should increase performance. +Description +Set this environment variable to enable control how many chunks are used for Reduce_Scatter pipeline-based collectives using device (GPU) buffers. +By-default: "0" + + +## CCL_REDUCE_PIPE_CHUNK_COUNT + + +Set this environment variable to enable pipelining implementation for Reduce collectives using device (GPU) buffers. + + + +Syntax +CCL_REDUCE_PIPE_CHUNK_COUNT="<value>" Arguments +"<value>" Description + - 0: (default) Bypasses the chunking/pipelining code and directly calls the topology-aware code + + - 1: Calls the pipelining code with a single chunk. Effectively, it has identical behavior and performance as with "0", but exercises the chunking code path with a single chunk. + + + +2 or higher: Divides the message into as many logical parts, or chunks, as specified. Then, it executes the collective with each logical chunk. This should allow for several phases of the algorithm to run in parallel, as long as they don't use the same physical resource. Effectively, this should increase performance. +Description +Set this environment variable to enable control how many chunks are used for Reduce pipeline-based collectives using device (GPU) buffers. +By-default: "0" + + ## CCL_LOCAL_RANK @@ -519,6 +666,30 @@ Set this environment variable to specify the job launcher to use. By-default: "hydra" +## CCL_ZE_ENABLE_OVERSUBSCRIPTION_FALLBACK + + +Set to enable oversubscription in topo fallback stage for all collectives. + + + +This enviroment variable enables or disables the oversubscription fallback from topo algorithm to copy in/out +"<value>" : "0", "1" +By-default: "1" + + +## CCL_ZE_ENABLE_OVERSUBSCRIPTION_THROW + + +Set to enable oversubscription throw for all collectives. + + + +This enviroment variable enables or disables the oversubscription throw check +"<value>" : "0", "1" +By-default: "1" + + @@ -589,6 +760,44 @@ Use number of ports to detect the 12 ports system and use write protocols on suc By-default: "1" +## CCL_ZE_PT2PT_READ + + +Enable switching of read and write protocols for pt2pt topo algorithm. + + + +Control pt2pt read/write protocols.
+ Read Protocol:
+ the SEND side exchanges its buffer handle with the RECV side, and the copy is executed on the RECV side: the destination buffer is the local buffer and the source buffer is the remote buffer.
+ Write Protocol:
+ the RECV side exchanges its buffer handle with the SEND side, and the copy is executed on the SEND side: the destination buffer is the remote buffer and the source buffer is the local buffer.
+ "<value>" : "0", "1"
+ By-default: "1" + + +## CCL_ZE_TYPE2_TUNE_PORTS + + +Tunable value for collectives to adjust copy engine indexes. + + + +use 2,4,6 copy engine indexes for host with 6 ports for allreduce, reduce and allgatherv "<value>": "on" - always use write mode with calculated indexes "off" - always disabled "detected" - determined by the logic in detection "undetected" - the default value, used before the logic in detection +By-default: "undetected" + + +## CCL_BARRIER_SYNC + + +Switch ccl::barrier() host-sync / host-async options. + + + +Historically ccl::barrier() was always synchronous. That does not match with oneCCL asynchronous concept. Same as other collectives, ccl::barrier() should be host-asynchronous if possible. As it would be too much to change in one moment, we start through experimental variable which introduces the option to make barrier host-asynchronous. Use CCL_BARRIER_SYNC=0 to achieve that. +By-default: "1 (SYNC)" + + Experimental OneCCL Environment Variables Functionality of these variables has not been (fully) tested and, therefore, cannot be supported nor guaranteed. diff --git a/man/README.md b/man/README.md new file mode 100644 index 000000000..ca5b6e750 --- /dev/null +++ b/man/README.md @@ -0,0 +1,15 @@ +# How to generate documentation: + +## Environment preparation + + 1. Make sure the following packages are installed: + - doxygen + - python3 + 2. Install python deps: `python3 -m pip install -r requirements.txt` + +## Doc generation + + 1. Generate docs by calling `./merge_docs.sh` + 2. The following files should be re-generated: + - man3/OneCCL.3 + - OneCCL.md diff --git a/man/doxconfig b/man/doxconfig index ccab3e245..ae1aa51dd 100644 --- a/man/doxconfig +++ b/man/doxconfig @@ -1,5 +1,5 @@ PROJECT_NAME = "Intel® oneAPI Collective Communications Library" -PROJECT_NUMBER = "2021.10" +PROJECT_NUMBER = "2021.11" INPUT = ../src/common/env/vars.hpp ../src/common/env/vars_experimental.hpp diff --git a/man/man3/OneCCL.3 b/man/man3/OneCCL.3 index c9ec6c9ca..e0ba0230b 100644 --- a/man/man3/OneCCL.3 +++ b/man/man3/OneCCL.3 @@ -1,4 +1,4 @@ -.TH "OneCCLvars" 3 "Tue Jun 6 2023" "Version 2021.10" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*- +.TH "OneCCLvars" 3 "Wed Aug 30 2023" "Version 2021.11" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*- .ad l .nh .SH NAME @@ -66,6 +66,14 @@ OneCCLvars \- OneCCL Environment Variables .br .RI "Set reduce-scatter algorithm\&. " .ti -1c +.RI "\fBCCL_RECV\fP" +.br +.RI "Set recv algorithm\&. " +.ti -1c +.RI "\fBCCL_SEND\fP" +.br +.RI "Set send algorithm\&. " +.ti -1c .RI "\fBCCL_ALLGATHERV_SCALEOUT\fP" .br .RI "Set scaleout allgatherv algorithm\&. " @@ -94,6 +102,10 @@ OneCCLvars \- OneCCL Environment Variables .br .RI "Set to specify minimum number of bytes in chunk for reduce_scatter phase in ring allreduce\&. " .ti -1c +.RI "\fBCCL_REDUCE_SCATTER_TOPO_READ\fP" +.br +.RI "Set this environment variable to select read or write based device-to-device data copy during the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives using device (GPU) buffers\&. " +.ti -1c .RI "\fBCCL_REDUCE_SCATTER_MONOLITHIC_KERNEL\fP" .br .RI "Set this environment variable to enable compute kernels for Allreduce, Reduce, and Reduce-Scatter collectives using device (GPU) buffers\&. " @@ -106,6 +118,22 @@ OneCCLvars \- OneCCL Environment Variables .br .RI "Set this environment variable to enable compute kernels for Alltoall and Alltoallv collectives using device (GPU) buffers\&. 
" .ti -1c +.RI "\fBCCL_ALLGATHERV_PIPE_CHUNK_COUNT\fP" +.br +.RI "Set this environment variable to enable pipelining implementation for Allgatherv collectives using device (GPU) buffers\&. " +.ti -1c +.RI "\fBCCL_ALLREDUCE_PIPE_CHUNK_COUNT\fP" +.br +.RI "Set this environment variable to enable pipelining implementation for Allreduce collectives using device (GPU) buffers\&. " +.ti -1c +.RI "\fBCCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT\fP" +.br +.RI "Set this environment variable to enable pipelining implementation for Reduce_Scatter collectives using device (GPU) buffers\&. " +.ti -1c +.RI "\fBCCL_REDUCE_PIPE_CHUNK_COUNT\fP" +.br +.RI "Set this environment variable to enable pipelining implementation for Reduce collectives using device (GPU) buffers\&. " +.ti -1c .RI "\fBCCL_LOCAL_RANK\fP" .br .RI "Set this environment variable to specify the rank number of the current process in the local host\&. " @@ -117,6 +145,14 @@ OneCCLvars \- OneCCL Environment Variables .RI "\fBCCL_PROCESS_LAUNCHER\fP" .br .RI "Set this environment variable to specify the job launcher to use\&. " +.ti -1c +.RI "\fBCCL_ZE_ENABLE_OVERSUBSCRIPTION_FALLBACK\fP" +.br +.RI "Set to enable oversubscription in topo fallback stage for all collectives\&. " +.ti -1c +.RI "\fBCCL_ZE_ENABLE_OVERSUBSCRIPTION_THROW\fP" +.br +.RI "Set to enable oversubscription throw for all collectives\&. " .in -1c .SH "Detailed Description" .PP @@ -161,6 +197,27 @@ Description Set this environment variable to enable compute kernels for Allgather collectives using device (GPU) buffers .PP By-default: '0' +.SS "CCL_ALLGATHERV_PIPE_CHUNK_COUNT" + +.PP +Set this environment variable to enable pipelining implementation for Allgatherv collectives using device (GPU) buffers\&. Syntax +.PP +CCL_ALLGATHERV_PIPE_CHUNK_COUNT='' Arguments +.PP +'' Description +.IP "\(bu" 2 +0: (default) Bypasses the chunking/pipelining code and directly calls the topology-aware code +.IP "\(bu" 2 +1: Calls the pipelining code with a single chunk\&. Effectively, it has identical behavior and performance as with '0', but exercises the chunking code path with a single chunk\&. +.PP +.PP +2 or higher: Divides the message into as many logical parts, or chunks, as specified\&. Then, it executes the collective with each logical chunk\&. This should allow for several phases of the algorithm to run in parallel, as long as they don't use the same physical resource\&. Effectively, this should increase performance\&. +.PP +Description +.PP +Set this environment variable to enable control how many chunks are used for Allgatherv, pipeline-based collectives using device (GPU) buffers\&. +.PP +By-default: '0' .SS "CCL_ALLGATHERV_SCALEOUT" .PP @@ -201,6 +258,27 @@ topo Topo scaleup algorithm (available if sycl and l0 are enabled) .PP .PP By-default: 'topo', if sycl and l0 are enable, otherwise 'ring' +.SS "CCL_ALLREDUCE_PIPE_CHUNK_COUNT" + +.PP +Set this environment variable to enable pipelining implementation for Allreduce collectives using device (GPU) buffers\&. Syntax +.PP +CCL_ALLREDUCE_PIPE_CHUNK_COUNT='' Arguments +.PP +'' Description +.IP "\(bu" 2 +0: (default) Bypasses the chunking/pipelining code and directly calls the topology-aware code +.IP "\(bu" 2 +1: Calls the pipelining code with a single chunk\&. Effectively, it has identical behavior and performance as with '0', but exercises the chunking code path with a single chunk\&. +.PP +.PP +2 or higher: Divides the message into as many logical parts, or chunks, as specified\&. Then, it executes the collective with each logical chunk\&. 
This should allow for several phases of the algorithm to run in parallel, as long as they don't use the same physical resource\&. Effectively, this should increase performance\&. +.PP +Description +.PP +Set this environment variable to enable control how many chunks are used for Allreduce pipeline-based collectives using device (GPU) buffers\&. +.PP +By-default: '0' .SS "CCL_ALLREDUCE_SCALEOUT" .PP @@ -428,6 +506,19 @@ Description Set this environment variable to specify the job launcher to use\&. .PP By-default: 'hydra' +.SS "CCL_RECV" + +.PP +Set recv algorithm\&. RECV algorithms +.IP "\(bu" 2 +direct Using prepost(d2h-h2d) copies to get host buffers to invoke mpi/ofi->recv() +.IP "\(bu" 2 +topo Topo scale-up algorithm (available if sycl and l0 are enabled) +.IP "\(bu" 2 +offload Using device buffers directly into mpi/ofi layer skipping prepost copies d2h h2d\&. By-default used for scale-out\&. Setting extra MPI env vars for getting better performance (available if sycl and l0 are enabled) +.PP +.PP +By-default: 'topo' if sycl and l0 are enabled, otherwise offload for ofi/mpi transport .SS "CCL_REDUCE" .PP @@ -447,6 +538,27 @@ topo Topo scaleup algorithm (available if sycl and l0 are enabled) .PP .PP By-default: 'topo' if sycl and l0 are enabled, otherwise tree for ofi transport or direct for mpi +.SS "CCL_REDUCE_PIPE_CHUNK_COUNT" + +.PP +Set this environment variable to enable pipelining implementation for Reduce collectives using device (GPU) buffers\&. Syntax +.PP +CCL_REDUCE_PIPE_CHUNK_COUNT='' Arguments +.PP +'' Description +.IP "\(bu" 2 +0: (default) Bypasses the chunking/pipelining code and directly calls the topology-aware code +.IP "\(bu" 2 +1: Calls the pipelining code with a single chunk\&. Effectively, it has identical behavior and performance as with '0', but exercises the chunking code path with a single chunk\&. +.PP +.PP +2 or higher: Divides the message into as many logical parts, or chunks, as specified\&. Then, it executes the collective with each logical chunk\&. This should allow for several phases of the algorithm to run in parallel, as long as they don't use the same physical resource\&. Effectively, this should increase performance\&. +.PP +Description +.PP +Set this environment variable to enable control how many chunks are used for Reduce pipeline-based collectives using device (GPU) buffers\&. +.PP +By-default: '0' .SS "CCL_REDUCE_SCALEOUT" .PP @@ -498,6 +610,46 @@ Description Set this environment variable to enable compute kernels for Allreduce, Reduce, and Reduce-Scatter collectives using device (GPU) buffers .PP By-default: '0' +.SS "CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT" + +.PP +Set this environment variable to enable pipelining implementation for Reduce_Scatter collectives using device (GPU) buffers\&. Syntax +.PP +CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT='' Arguments +.PP +'' Description +.IP "\(bu" 2 +0: (default) Bypasses the chunking/pipelining code and directly calls the topology-aware code +.IP "\(bu" 2 +1: Calls the pipelining code with a single chunk\&. Effectively, it has identical behavior and performance as with '0', but exercises the chunking code path with a single chunk\&. +.PP +.PP +2 or higher: Divides the message into as many logical parts, or chunks, as specified\&. Then, it executes the collective with each logical chunk\&. This should allow for several phases of the algorithm to run in parallel, as long as they don't use the same physical resource\&. Effectively, this should increase performance\&. 
+.PP +Description +.PP +Set this environment variable to enable control how many chunks are used for Reduce_Scatter pipeline-based collectives using device (GPU) buffers\&. +.PP +By-default: '0' +.SS "CCL_REDUCE_SCATTER_TOPO_READ" + +.PP +Set this environment variable to select read or write based device-to-device data copy during the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives using device (GPU) buffers\&. Syntax CCL_REDUCE_SCATTER_TOPO_READ='' +.PP +Arguments +.PP +'' Description +.IP "\(bu" 2 +1 Uses read based copy to transfer data across GPUs for the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives (default)\&. +.IP "\(bu" 2 +0 Uses write based copy to transfer data across GPUs for the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives\&. +.PP +.PP +Description +.PP +Set this environment variable to select read or write based device-to-device data copy during the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives using device (GPU) buffers\&. +.PP +By-default: '1' .SS "CCL_RS_CHUNK_COUNT" .PP @@ -510,6 +662,19 @@ By-default: '1' Set to specify minimum number of bytes in chunk for reduce_scatter phase in ring allreduce\&. '' - Minimum number of bytes in chunk for reduce_scatter phase in ring allreduce\&. Affects actual value of CCL_RS_CHUNK_COUNT\&. .PP By-default: '65536' +.SS "CCL_SEND" + +.PP +Set send algorithm\&. SEND algorithms +.IP "\(bu" 2 +direct Using prepost(d2h-h2d) copies to get host buffers to invoke mpi/ofi->send() +.IP "\(bu" 2 +topo Topo scale-up algorithm (available if sycl and l0 are enabled) +.IP "\(bu" 2 +offload Using device buffers directly into mpi/ofi layer skipping prepost copies d2h h2d\&. By-default used for scale-out\&. Setting extra MPI env vars for getting better performance (available if sycl and l0 are enabled) +.PP +.PP +By-default: 'topo' if sycl and l0 are enabled, otherwise offload for ofi/mpi transport .SS "CCL_WORKER_AFFINITY" .PP @@ -538,10 +703,26 @@ Set to specify memory affinity for oneCCL worker threads\&. A comma-separated list of NUMA node numbers for all local workers, one number per worker\&. The i-th local worker is pinned to the i-th NUMA node in the list\&. The number should not exceed the number of NUMA nodes available on the system\&. .PP By-default: 'not-specified' +.SS "CCL_ZE_ENABLE_OVERSUBSCRIPTION_FALLBACK" + +.PP +Set to enable oversubscription in topo fallback stage for all collectives\&. This enviroment variable enables or disables the oversubscription fallback from topo algorithm to copy in/out +.PP +'' : '0', '1' +.PP +By-default: '1' +.SS "CCL_ZE_ENABLE_OVERSUBSCRIPTION_THROW" + +.PP +Set to enable oversubscription throw for all collectives\&. This enviroment variable enables or disables the oversubscription throw check +.PP +'' : '0', '1' +.PP +By-default: '1' .SH "Author" .PP Generated automatically by Doxygen for Intel® oneAPI Collective Communications Library from the source code\&. -.TH "ExpOneCCLvars" 3 "Tue Jun 6 2023" "Version 2021.10" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*- +.TH "ExpOneCCLvars" 3 "Wed Aug 30 2023" "Version 2021.11" "Intel® oneAPI Collective Communications Library" \" -*- nroff -*- .ad l .nh .SH NAME @@ -575,6 +756,18 @@ ExpOneCCLvars \- Experimental OneCCL Environment Variables .RI "\fBCCL_ZE_AUTO_TUNE_PORTS\fP" .br .RI "Automatically tune algorithm protocols based on port count\&. 
" +.ti -1c +.RI "constexpr const char * \fBCCL_ZE_PT2PT_READ\fP = 'CCL_ZE_PT2PT_READ'" +.br +.RI "Enable switching of read and write protocols for pt2pt topo algorithm\&. " +.ti -1c +.RI "constexpr const char * \fBCCL_ZE_TYPE2_TUNE_PORTS\fP = 'CCL_ZE_TYPE2_TUNE_PORTS'" +.br +.RI "Tunable value for collectives to adjust copy engine indexes\&. " +.ti -1c +.RI "\fBCCL_BARRIER_SYNC\fP" +.br +.RI "Switch ccl::barrier() host-sync / host-async options\&. " .in -1c .SH "Detailed Description" .PP @@ -583,6 +776,12 @@ Experimental OneCCL Environment Variables Functionality of these variables has n .SH "Variable Documentation" .PP +.SS "CCL_BARRIER_SYNC" + +.PP +Switch ccl::barrier() host-sync / host-async options\&. Historically ccl::barrier() was always synchronous\&. That does not match with oneCCL asynchronous concept\&. Same as other collectives, ccl::barrier() should be host-asynchronous if possible\&. As it would be too much to change in one moment, we start through experimental variable which introduces the option to make barrier host-asynchronous\&. Use CCL_BARRIER_SYNC=0 to achieve that\&. +.PP +By-default: '1 (SYNC)' .SS "CCL_REDUCE_SCATTER_FALLBACK_ALGO" .PP @@ -629,6 +828,28 @@ Set to specify the mechanism to use for Level Zero IPC exchange\&. '': 'drmfd', 'pidfd', 'sockets' .PP By-default: 'drmfd' +.SS "constexpr const char* CCL_ZE_PT2PT_READ = 'CCL_ZE_PT2PT_READ'\fC [constexpr]\fP" + +.PP +Enable switching of read and write protocols for pt2pt topo algorithm\&. Control pt2pt read/write protocols\&. +.br + Read Protocol: +.br + It means SEND side is exchanging the handle with RECV side\&. Then execute the copy operation on the RECV operation side, where the dst buf is the local buffer and the source buffer is the remote buffer\&. +.br + Write Protocol: +.br + it means RECV side is exchanging the handle with SEND side\&. Execute the copy operation on the SEND operation side, where the dst buf is the remote buffer and the source buffer is the local buffer\&. +.br + '' : '0', '1' +.br + By-default: '1' +.SS "constexpr const char* CCL_ZE_TYPE2_TUNE_PORTS = 'CCL_ZE_TYPE2_TUNE_PORTS'\fC [constexpr]\fP" + +.PP +Tunable value for collectives to adjust copy engine indexes\&. use 2,4,6 copy engine indexes for host with 6 ports for allreduce, reduce and allgatherv '': 'on' - always use write mode with calculated indexes 'off' - always disabled 'detected' - determined by the logic in detection 'undetected' - the default value, used before the logic in detection +.PP +By-default: 'undetected' .SH "Author" .PP Generated automatically by Doxygen for Intel® oneAPI Collective Communications Library from the source code\&. diff --git a/man/merge_doc.sh b/man/merge_doc.sh index 0c7eb5930..7619c9ff8 100755 --- a/man/merge_doc.sh +++ b/man/merge_doc.sh @@ -40,6 +40,13 @@ check_file_exists() { fi } +check_program_exists() { + if ! which "$1"; then + echo "Error: $1 not found." 
>&2 + exit 1 + fi +} + # Define a function to print help message print_help() { echo "Description: This script extracts information from two XML files, converts it to Markdown format, and combines it into" @@ -78,6 +85,10 @@ extract_xml_info() { $SCRIPT_DIR/doxy_to_md.py $1 > $2 } +# generate docs +check_program_exists doxygen +doxygen "$SCRIPT_DIR/doxconfig" + # Combine two man files into one check_file_exists "$MANMainOneCCLvars" check_file_exists "$MANExpOneCCLvars" diff --git a/pkgconfig/oneapi/cpu/template.pc b/pkgconfig/oneapi/cpu/template.pc new file mode 100644 index 000000000..f6867220f --- /dev/null +++ b/pkgconfig/oneapi/cpu/template.pc @@ -0,0 +1,13 @@ + +prefix=${pcfiledir}/../../ +exec_prefix=${prefix} +libdir=${exec_prefix}/lib/ccl/cpu/lib +includedir=${prefix}/include + +Name: oneAPI Collective Communications Library (oneCCL) +Description: oneCCL provides an efficient implementation of communication patterns used in deep learning. +URL: https://github.com/oneapi-src/oneCCL +Version: CCL_SUBSTITUTE_OFFICIAL_VERSION +Requires: impi +Libs: -L${libdir} -lccl +Cflags: -I${includedir} diff --git a/pkgconfig/oneapi/template.pc b/pkgconfig/oneapi/template.pc new file mode 100644 index 000000000..f4baf617d --- /dev/null +++ b/pkgconfig/oneapi/template.pc @@ -0,0 +1,12 @@ +prefix=${pcfiledir}/../../ +exec_prefix=${prefix} +libdir=${exec_prefix}/lib/ +includedir=${prefix}/include/ + +Name: oneAPI Collective Communications Library (oneCCL) +Description: oneCCL provides an efficient implementation of communication patterns used in deep learning. +URL: https://github.com/oneapi-src/oneCCL +Version: CCL_SUBSTITUTE_OFFICIAL_VERSION +Requires: impi +Libs: -L${libdir} -lccl @OTHER_FLAGS@ +Cflags: -I${includedir} diff --git a/pkgconfig/template.pc b/pkgconfig/template.pc index 85b74168f..5577f7957 100755 --- a/pkgconfig/template.pc +++ b/pkgconfig/template.pc @@ -1,4 +1,3 @@ -# prefix=${pcfiledir}/../../ exec_prefix=${prefix} libdir=${exec_prefix}/lib/@BUILD_TYPE@ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1aa6e22f4..56a47e707 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -30,7 +30,8 @@ if (CCL_ENABLE_SYCL AND CCL_ENABLE_ZE) sched/entry/ze/ze_alltoallv_entry.cpp sched/entry/ze/ze_base_entry.cpp sched/entry/ze/ze_barrier_entry.cpp - sched/entry/ze/ze_cache.cpp + sched/entry/ze/cache/ze_cache.cpp + sched/entry/ze/cache/ze_device_cache.cpp sched/entry/ze/ze_call.cpp sched/entry/ze/ze_command.cpp sched/entry/ze/ze_copy_entry.cpp @@ -38,8 +39,10 @@ if (CCL_ENABLE_SYCL AND CCL_ENABLE_ZE) sched/entry/ze/ze_kernel.cpp sched/entry/ze/ze_event_signal_entry.cpp sched/entry/ze/ze_event_wait_entry.cpp + sched/entry/ze/ze_membarrier_entry.cpp sched/entry/ze/ze_onesided_reduce_entry.cpp sched/entry/ze/ze_primitives.cpp + sched/entry/ze/ze_pt2pt_barrier_entry.cpp sched/entry/ze/ze_reduce_local_entry.cpp common/global/ze/ze_data.cpp @@ -100,8 +103,10 @@ set(CCL_SRC coll/algorithms/barrier.cpp coll/algorithms/bcast.cpp coll/algorithms/double_tree_ops.cpp + coll/algorithms/recv.cpp coll/algorithms/reduce.cpp coll/algorithms/reduce_scatter.cpp + coll/algorithms/send.cpp coll/coll.cpp coll/coll_check.cpp coll/selection/selection.cpp @@ -111,8 +116,10 @@ set(CCL_SRC coll/selection/selector_alltoallv.cpp coll/selection/selector_barrier.cpp coll/selection/selector_bcast.cpp + coll/selection/selector_recv.cpp coll/selection/selector_reduce.cpp coll/selection/selector_reduce_scatter.cpp + coll/selection/selector_send.cpp comm/atl_tag.cpp comm/comm.cpp @@ -133,6 +140,7 @@ set(CCL_SRC 
common/utils/exchange_utils.cpp common/utils/fd_info.cpp common/utils/memcpy.cpp + common/utils/profile.cpp common/utils/spinlock.cpp common/utils/utils.cpp common/utils/version.cpp @@ -351,7 +359,7 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/include/ if ("${LIBFABRIC_DIR}" STREQUAL "") # internal libfabric is used, install it into package install(DIRECTORY ${DEPS_DIR}/ofi/lib/ - DESTINATION ${CCL_INSTALL_LIB}) + DESTINATION ${CMAKE_INSTALL_PREFIX}/opt/mpi/libfabric/lib) endif() if (ENABLE_MPI) @@ -361,8 +369,8 @@ if (ENABLE_MPI) DESTINATION ${CCL_INSTALL_INCLUDE}) install(DIRECTORY ${DEPS_DIR}/mpi/lib/ DESTINATION ${CCL_INSTALL_LIB}) - install(DIRECTORY ${DEPS_DIR}/mpi/etc/ - DESTINATION ${CCL_INSTALL_ETC}) + install(DIRECTORY ${DEPS_DIR}/mpi/opt/mpi/etc/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/opt/mpi/etc/) install(DIRECTORY ${DEPS_DIR}/mpi/licensing/ DESTINATION ${CCL_INSTALL_LICENSE}/mpi/) endif() diff --git a/src/atl/atl_base_comm.cpp b/src/atl/atl_base_comm.cpp index 4e2f44f17..206e66a65 100644 --- a/src/atl/atl_base_comm.cpp +++ b/src/atl/atl_base_comm.cpp @@ -23,6 +23,7 @@ #include "atl/ofi/atl_ofi.hpp" #include "atl/util/pm/pm_rt.h" #include "common/utils/utils.hpp" +#include "comm/atl_tag.hpp" #include "exec/exec.hpp" atl_attr_t atl_base_comm::attr = { @@ -137,8 +138,32 @@ int atl_base_comm::create_comm_id() { } void atl_base_comm::init_tag() { - tag_creator = - std::shared_ptr(new ccl_atl_tag(attr.out.tag_bits, attr.out.max_tag)); + auto transport_type = ccl::global_data::env().atl_transport; + + size_t tag_bits = attr.out.tag_bits; + size_t max_tag = attr.out.max_tag; + + switch (transport_type) { + case ccl_atl_ofi: + if (tag_bits == tag_layout::cxi) { + tag_creator = std::shared_ptr( + new ccl_atl_tag_impl(tag_bits, max_tag)); + } + else { + tag_creator = std::shared_ptr( + new ccl_atl_tag_impl(tag_bits, max_tag)); + } + break; +#ifdef CCL_ENABLE_MPI + case ccl_atl_mpi: + CCL_THROW_IF_NOT(max_tag >= mpi_tag_layout::op_id_mask + mpi_tag_layout::sched_id_mask, + "sched_id and op_id have to be encoded uncut for MPI case"); + tag_creator = std::shared_ptr( + new ccl_atl_tag_impl(tag_bits, max_tag)); + break; +#endif // CCL_ENABLE_MPI + default: LOG_ERROR("unsupported tag type"); break; + } if (rank == 0) { LOG_DEBUG("atl tag: ", tag_creator->to_string()); } diff --git a/src/atl/atl_def.h b/src/atl/atl_def.h index fc38ce526..c67d84e56 100644 --- a/src/atl/atl_def.h +++ b/src/atl/atl_def.h @@ -68,6 +68,14 @@ } \ } while (0) +#define ATL_CHECK_PTR(ptr, str) \ + do { \ + if (!ptr) { \ + LOG_ERROR("%s, errno: %s", str, strerror(errno)); \ + return ATL_STATUS_FAILURE; \ + } \ + } while (0) + #define ATL_SET_STR(dst, size, ...) \ do { \ if (snprintf(dst, size, __VA_ARGS__) > size) { \ diff --git a/src/atl/mpi/atl_mpi.cpp b/src/atl/mpi/atl_mpi.cpp index c26a72348..505354158 100644 --- a/src/atl/mpi/atl_mpi.cpp +++ b/src/atl/mpi/atl_mpi.cpp @@ -99,6 +99,7 @@ atl_status_t atl_mpi::init(int* argc, attr->out.mnic_type = ctx.mnic_type; attr->out.mnic_count = ctx.mnic_count; attr->out.tag_bits = 32; + // MPI specification requires the user tag to be minimum 16 bits. attr->out.max_tag = (is_tag_ub_set) ? 
*((int*)tag_ub_ptr) : 0; attr->out.max_order_waw_size = 0; diff --git a/src/atl/mpi/atl_mpi_ctx.cpp b/src/atl/mpi/atl_mpi_ctx.cpp index 06dcc6993..d55968d66 100644 --- a/src/atl/mpi/atl_mpi_ctx.cpp +++ b/src/atl/mpi/atl_mpi_ctx.cpp @@ -557,15 +557,12 @@ atl_status_t atl_mpi_ctx::set_impi_env(const atl_attr_t& attr, const atl_mpi_lib setenv("I_MPI_SHM_CMA", "0", 0); if (attr.in.enable_hmem && lib_attr.hmem) { setenv("I_MPI_OFFLOAD", "2", 0); - setenv("I_MPI_OFFLOAD_TOPOLIB", "l0", 0); - setenv("I_MPI_OFFLOAD_QUEUE_CACHE", "1", 0); - setenv("I_MPI_OFFLOAD_LIST_CACHE", "1", 0); - setenv("I_MPI_OFFLOAD_MEMCPY_KIND", "blocked", 0); if (attr.in.ep_count > 1) { /* try to set global lock level before vci level because setenv is invoked with overwrite=0 */ setenv("I_MPI_THREAD_LOCK_LEVEL", "global", 0); } + LOG_DEBUG("IMPI case: gpu support is enabled"); } #endif // CCL_ENABLE_SYCL @@ -587,18 +584,6 @@ atl_status_t atl_mpi_ctx::set_mpich_env(const atl_attr_t& attr) { setenv("MPIR_CVAR_CH4_OFI_MAX_VCIS", ep_count_str, 0); setenv("MPIR_COMM_HINT_VCI", EP_IDX_KEY, 0); - int enable_gpu = 0; -#ifdef CCL_ENABLE_SYCL - if (attr.in.enable_hmem) { - enable_gpu = 1; - } -#endif // CCL_ENABLE_SYCL - setenv("MPIR_CVAR_ENABLE_GPU", (enable_gpu ? "1" : "0"), 0); - - if (enable_gpu) { - setenv("MPIR_CVAR_CH4_IPC_ZE_SHAREABLE_HANDLE", "pidfd", 0); - } - auto& env = ccl::global_data::env(); if (env.log_level >= ccl_log_level::debug) { setenv("MPIR_CVAR_CH4_RUNTIME_CONF_DEBUG", "1", 0); @@ -621,11 +606,11 @@ atl_status_t atl_mpi_ctx::check_impi_env(const atl_attr_t& attr) { if (atoi(ep_count_env) != (int)(get_ep_count(attr))) return ATL_STATUS_FAILURE; - if (!getenv("I_MPI_ROOT")) { + if (!getenv("ONEAPI_ROOT") && !getenv("I_MPI_ROOT")) { atl_mpi_lib_type_t type = ATL_MPI_LIB_IMPI; LOG_ERROR("CCL/MPI uses ", mpi_lib_infos[type].version_prefix_1, - " but I_MPI_ROOT is not set. ", + " but neither I_MPI_ROOT nor ONEAPI_ROOT is set. 
", "Please source ", mpi_lib_infos[type].kind_value, " version of ", diff --git a/src/atl/ofi/atl_ofi.cpp b/src/atl/ofi/atl_ofi.cpp index e2f2bac09..9e4dfeee6 100644 --- a/src/atl/ofi/atl_ofi.cpp +++ b/src/atl/ofi/atl_ofi.cpp @@ -73,6 +73,10 @@ atl_status_t atl_ofi::init(int* argc, ctx.ep_count = attr->in.ep_count; + if (!pmi) { + LOG_ERROR("pmi is null"); + goto err; + } coord.global_count = pmi->get_size(); coord.global_idx = pmi->get_rank(); @@ -130,7 +134,8 @@ atl_status_t atl_ofi::init(int* argc, attr->out.max_tag = 0xFFFFFFFFFFFFFFFF; #ifdef CCL_ENABLE_OFI_HMEM - if (prov_env && (strstr(prov_env, "verbs") || strstr(prov_env, "cxi")) && + if (prov_env && + (strstr(prov_env, "verbs") || strstr(prov_env, "cxi") || strstr(prov_env, "psm3")) && attr->in.enable_hmem) { struct fi_info* hmem_hints = fi_dupinfo(base_hints); atl_attr_t hmem_attr = *attr; @@ -960,7 +965,6 @@ atl_status_t atl_ofi::open_providers(char* prov_env, size_t prov_idx = 0; int enable_shm = 0; ssize_t ret = 0; - char* prov_name = nullptr; struct fi_info *prov_list = nullptr, *prov_hints = nullptr; atl_ofi_prov_t* prov = nullptr; @@ -1006,17 +1010,16 @@ atl_status_t atl_ofi::open_providers(char* prov_env, /* open SHM provider */ if (enable_shm) { prov_idx = ctx.shm_prov_idx; - prov_name = strdup(ATL_OFI_SHM_PROV_NAME); prov = &ctx.provs[prov_idx]; prov->idx = prov_idx; prov->is_shm = 1; - ATL_CALL(atl_ofi_get_prov_list(ctx, prov_name, base_hints, &prov_list), goto err); + ATL_CALL(atl_ofi_get_prov_list(ctx, ATL_OFI_SHM_PROV_NAME, base_hints, &prov_list), + goto err); if (ep_names.size() < prov->idx + 1) { ep_names.resize(prov->idx + 1); } ATL_CALL(atl_ofi_prov_init(ctx, coord, prov_list, prov, attr, pmi, ep_names[prov->idx]), goto err); - free(prov_name); fi_freeinfo(prov_list); ctx.prov_count++; } diff --git a/src/atl/ofi/atl_ofi.hpp b/src/atl/ofi/atl_ofi.hpp index db98a7cae..113bdfb7c 100644 --- a/src/atl/ofi/atl_ofi.hpp +++ b/src/atl/ofi/atl_ofi.hpp @@ -232,7 +232,7 @@ class atl_ofi : public atl_base_transport { void push(size_t idx, fid_mr* mr); private: - int enable_hmem; + int enable_hmem{ 0 }; std::vector memory_regions; }; diff --git a/src/atl/ofi/atl_ofi_helper.cpp b/src/atl/ofi/atl_ofi_helper.cpp index 1db451e94..d5b757ff4 100644 --- a/src/atl/ofi/atl_ofi_helper.cpp +++ b/src/atl/ofi/atl_ofi_helper.cpp @@ -277,9 +277,15 @@ atl_status_t atl_ofi_prov_update_addr_table(atl_ofi_ctx_t& ctx, return ATL_STATUS_FAILURE; } - ATL_CHECK_STATUS(pmi->pmrt_barrier(), "barrier failed"); - + /* variable initialization must happen before the first *goto* statement */ std::vector ret_ep_name(addr_len, '\0'); + + if (pmi->pmrt_barrier() != ATL_STATUS_SUCCESS) { + LOG_ERROR("barrier failed"); + ret = ATL_STATUS_FAILURE; + goto err_ep_names; + } + next_ep_name = ep_names_table; /* retrieve all OFI EP names in order */ for (i = 0; i < coord.global_count; i++) { @@ -822,7 +828,7 @@ atl_status_t atl_ofi_set_env(const atl_attr_t& attr) { to workaround issue with undefined symbols in case of out-of-tree providers, like OFI/PSM3 */ - global_data.dlhandle = dlopen("libfabric.so", RTLD_GLOBAL | RTLD_NOW); + global_data.dlhandle = dlopen(ccl::get_ofi_lib_path().c_str(), RTLD_GLOBAL | RTLD_NOW); if (global_data.dlhandle == nullptr) { LOG_WARN("dlopen (libfabric.so): ", dlerror()); } @@ -857,17 +863,15 @@ atl_status_t atl_ofi_get_prov_list(atl_ofi_ctx_t& ctx, ret = fi_getinfo(fi_version, nullptr, nullptr, 0ULL, hints, &prov_list); - if ((ret || !prov_list || !strcmp(prov_list->fabric_attr->prov_name, ATL_OFI_SHM_PROV_NAME)) && - 
prov_list->caps & FI_HMEM) { - // skip OFI/SHM with HMEM capability - fi_freeinfo(hints); - fi_freeinfo(prov_list); - return ATL_STATUS_FAILURE; - } if (ret || !prov_list) { LOG_ERROR("fi_getinfo error: ret ", ret, ", providers ", (void*)prov_list); goto err; } + if (!strcmp(prov_list->fabric_attr->prov_name, ATL_OFI_SHM_PROV_NAME) && + prov_list->caps & FI_HMEM) { + LOG_ERROR("skip OFI/SHM with HMEM capability"); + goto err; + } if (prov_list->domain_attr->max_ep_tx_ctx > 1) { hints->ep_attr->tx_ctx_cnt = ctx.ep_count; @@ -969,7 +973,7 @@ atl_status_t atl_ofi_prov_init(atl_ofi_ctx_t& ctx, } /* TODO: make separate function to be called on CCL comm creation */ - ret = atl_ofi_prov_eps_connect(ctx, coord, prov->idx, pmi, ep_names); + ret = atl_ofi_prov_eps_connect(ctx, coord, prov->idx, std::move(pmi), ep_names); if (ret) { LOG_ERROR("atl_ofi_prov_eps_connect error, prov_idx ", prov->idx); goto err; } @@ -1348,9 +1352,9 @@ atl_status_t atl_ofi_open_nw_provs(atl_ofi_ctx_t& ctx, prov = &ctx.provs[prov_idx]; prov->idx = prov_idx; prov->is_shm = 0; - ATL_CALL( - atl_ofi_prov_init(ctx, coord, final_provs[idx], prov, attr, pmi, ep_names[prov->idx]), - goto err); + ATL_CALL(atl_ofi_prov_init( + ctx, coord, final_provs[idx], prov, attr, std::move(pmi), ep_names[prov->idx]), + goto err); } exit: diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h index 64ab9987a..797ce158b 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable.h @@ -21,6 +21,39 @@ #include "atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.hpp" #include "atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.hpp" +// WARNING - the functionality related to this code is not documented and not tested! +// +// The functionality was marked in logs as *DEPRECATED* +// +// Some general comments with regard to the whole oneCCL, as of the time of this commit: +// +// The following steps would have to be followed to instantiate pmi_resizable in oneCCL: +// +// 1. CCL_PM_TYPE=resizable # not tested and undocumented; +// the related code is not placed in the env module +// 2. CCL_ATL_TRANSPORT=ofi # mpi transport layer does not support this functionality +// 3.
create communicator via `ccl::preview::create_communicator()`, +// without specifying comm size: only one of our tests +// uses this function, as one of many options, +// buried within nested ifs +// +// There are 3 classes that implement the related ipmi interface: +// +// pmi_resizable +// pmi_resizable_simple +// pmi_resizable_simple_internal +// +// but the latter two are instantiated in most cases and they don’t support the functionality: +// +// atl_status_t pmi_resizable_simple::pmrt_update() { +// return ATL_STATUS_UNSUPPORTED; +// } +// +// atl_status_t pmi_resizable_simple_internal::pmrt_update() { +// LOG_ERROR("unsupported"); +// return ATL_STATUS_UNSUPPORTED; +// } + #define PMIR_SUCCESS 0 #define PMIR_FAIL -1 #define PMIR_ERR_INIT 1 @@ -50,10 +83,17 @@ class pmi_resizable final : public ipmi { pmi_resizable() = delete; explicit pmi_resizable(std::shared_ptr k, const char* main_addr = "") : main_addr(main_addr), - h(std::make_shared(k)) {} + h(std::make_shared(k)) { + LOG_WARN("WARNING: resizable pmi is an undocumented and deprecated functionality;" + " the functionality might be removed without notice"); + } ~pmi_resizable() override; + pmi_resizable& operator=(const pmi_resizable&) = delete; + + pmi_resizable(const pmi_resizable&) = delete; + int is_pm_resize_enabled() override; atl_status_t pmrt_main_addr_reserve(char* addr) override; diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h index 36aab26d0..f5b0795c7 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/def.h @@ -51,11 +51,19 @@ typedef enum { KVS_STATUS_SUCCESS, KVS_STATUS_FAILURE, KVS_STATUS_UNSUPPORTED } } \ } while (0) -#define CHECK_FGETS(expr, str) \ +#define CHECK_FGETS(expr, str, fd) \ do { \ char* res = expr; \ - if (!res || res != str) { \ - LOG_ERROR("fgets error: ", strerror(errno)); \ + int fgets_errno = errno; \ + int pclose_ret = pclose(fd); \ + int pclose_errno = errno; \ + if (!res || res != str || pclose_ret) { \ + if (!res || res != str) { \ + LOG_ERROR("fgets error: ", strerror(fgets_errno)); \ + } \ + if (pclose_ret) { \ + LOG_ERROR("pclose error: ", strerror(pclose_errno)); \ + } \ return KVS_STATUS_FAILURE; \ } \ } while (0) diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp index 04b069db6..46af60281 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/helper.cpp @@ -31,35 +31,50 @@ int killed_ranks_count = 0; std::list new_ranks; int new_ranks_count = 0; +/* + * WARNING: this module contains some legacy, unused or dead code + * + * some related code that lacked documentation and tests + * was annotated as deprecated + * + * removed, dead code - left for reference: + * + * kvs_status_t helper::replace_str(char* str, int old_rank, int new_rank) { + * // throw std::runtime_error("unexpected path"); + * LOG_ERROR("unexpected path"); + * return KVS_STATUS_FAILURE; + * + * char old_str[INT_STR_SIZE]; + * char new_str[INT_STR_SIZE]; + * char* point_to_replace; + * int old_str_size; + * int new_str_size; + * + * SET_STR(old_str, INT_STR_SIZE, RANK_TEMPLATE, old_rank); + * SET_STR(new_str, INT_STR_SIZE, RANK_TEMPLATE, new_rank); + * + * point_to_replace = strstr(str, old_str); + * if (point_to_replace == NULL) { + * LOG_ERROR("not found old rank(%d) in str(%s)", old_rank, str); + * return KVS_STATUS_FAILURE; + * }
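 + * // when old_str and new_str differ in length, the memmove below shifts the string tail so the new rank digits fit in place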
* + * old_str_size = strlen(old_str); + * new_str_size = strlen(new_str); + * + * if (old_str_size != new_str_size) { + * size_t rest_len = strlen(point_to_replace) - old_str_size; + * memmove(point_to_replace + new_str_size, point_to_replace + old_str_size, rest_len); + * } + * memcpy(point_to_replace, new_str, new_str_size); + * return KVS_STATUS_SUCCESS; + * } + * + */ + kvs_status_t helper::replace_str(char* str, int old_rank, int new_rank) { - // throw std::runtime_error("unexpected path"); - LOG_ERROR("unexpected path"); + // This code is a part of the deprecated and undocumented resizable_pmi functionality return KVS_STATUS_FAILURE; - - char old_str[INT_STR_SIZE]; - char new_str[INT_STR_SIZE]; - char* point_to_replace; - int old_str_size; - int new_str_size; - - SET_STR(old_str, INT_STR_SIZE, RANK_TEMPLATE, old_rank); - SET_STR(new_str, INT_STR_SIZE, RANK_TEMPLATE, new_rank); - - point_to_replace = strstr(str, old_str); - if (point_to_replace == NULL) { - LOG_ERROR("not found old rank(%d) in str(%s)", old_rank, str); - return KVS_STATUS_FAILURE; - } - - old_str_size = strlen(old_str); - new_str_size = strlen(new_str); - - if (old_str_size != new_str_size) { - size_t rest_len = strlen(point_to_replace) - old_str_size; - memmove(point_to_replace + new_str_size, point_to_replace + old_str_size, rest_len); - } - memcpy(point_to_replace, new_str, new_str_size); - return KVS_STATUS_SUCCESS; } kvs_status_t helper::update_ranks(int* old_count, @@ -234,7 +249,9 @@ kvs_status_t helper::update_kvs_info(int new_rank) { char kvs_val[MAX_KVS_VAL_LENGTH]; size_t kvs_list_size = get_kvs_list_size(ST_CLIENT); + // this code is a part of undocumented and untested resizable_pmi functionality for (size_t kvs_idx = 0; kvs_idx < kvs_list_size; kvs_idx++) { + // this code is a part of undocumented and untested resizable_pmi functionality cut_head(kvs_name, kvs_key, kvs_val, ST_CLIENT); KVS_CHECK_STATUS(remove_name_key(kvs_name, kvs_key), "failed to remove name and key"); @@ -251,6 +268,7 @@ kvs_status_t helper::update_kvs_info(int new_rank) { kvs_status_t helper::move_to_new_rank(int new_rank) { char rank_str[INT_STR_SIZE]; + // this code is a part of undocumented resizable_pmi functionality KVS_CHECK_STATUS(update_kvs_info(new_rank), "failed to update kvs info"); my_rank = new_rank; @@ -265,6 +283,7 @@ kvs_status_t helper::update_my_info(const std::list& list) { char rank_str[INT_STR_SIZE]; + // this code is a part of undocumented resizable_pmi functionality for (const auto& it : list) { if (it.old_rank == static_cast(my_rank)) { int old_rank = my_rank; @@ -341,6 +360,7 @@ kvs_status_t helper::update(const std::list& list, std::list& dead_up_idx, int root_rank) { if (applied == 1) { + // this code is a part of undocumented resizable_pmi functionality if (!list.empty()) { if (static_cast(my_rank) == root_rank) { if (!dead_up_idx.empty()) { @@ -410,6 +430,7 @@ kvs_status_t helper::get_count_requested_ranks(char* rank, size_t& count_pods_wi kvs_status_t helper::occupied_rank(char* rank) { std::string idx_val; + // this code is a part of undocumented resizable_pmi functionality KVS_CHECK_STATUS(get_value_by_name_key(KVS_UP, KVS_IDX, idx_val), "failed to get ID"); if ((idx_val.empty()) && (my_rank == 0)) { diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp index 95e2200ce..07d89c388 100644 ---
a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.cpp @@ -192,6 +192,8 @@ kvs_status_t internal_kvs::init_main_server_by_string(const char* main_addr) { char* port = nullptr; KVS_CHECK_STATUS(local_server_address->set_sin_addr(local_host_ip), "failed to set main_ip"); + CCL_ASSERT(server_listen_sock == + INVALID_SOCKET); // make sure the socket is not initialized twice if ((server_listen_sock = socket(address_family, SOCK_STREAM, 0)) < 0) { LOG_ERROR("server_listen_sock init"); return KVS_STATUS_FAILURE; @@ -332,6 +334,8 @@ kvs_status_t internal_kvs::kvs_main_server_address_reserve(char* main_address) { KVS_CHECK_STATUS(fill_local_host_ip(), "failed to get local host IP"); + CCL_ASSERT(server_listen_sock == + INVALID_SOCKET); // make sure the socket is not initialized twice if ((server_listen_sock = socket(address_family, SOCK_STREAM, 0)) < 0) { LOG_ERROR("server_listen_sock init"); return KVS_STATUS_FAILURE; @@ -379,7 +383,8 @@ kvs_status_t internal_kvs::init_main_server_address(const char* main_addr) { if (server_address.empty()) { if (main_addr != NULL) { ip_getting_mode = IGT_ENV; - if (server_listen_sock == 0) { + if (server_listen_sock == + INVALID_SOCKET) { // make sure the socket is not initialized twice KVS_CHECK_STATUS(init_main_server_by_string(main_addr), "failed to init main server"); } @@ -392,6 +397,8 @@ kvs_status_t internal_kvs::init_main_server_address(const char* main_addr) { KVS_CHECK_STATUS(local_server_address->set_sin_addr(local_host_ip), "failed to set local_ip"); + CCL_ASSERT(server_listen_sock == + INVALID_SOCKET); // make sure the socket is not initialized twice if ((server_listen_sock = socket(address_family, SOCK_STREAM, 0)) < 0) { LOG_ERROR("server_listen_sock init"); return KVS_STATUS_FAILURE; @@ -502,7 +509,11 @@ kvs_status_t internal_kvs::kvs_init(const char* main_addr) { return KVS_STATUS_FAILURE; } - getsockname(server_control_sock, addr->get_sock_addr_ptr(), &len); + if (getsockname(server_control_sock, addr->get_sock_addr_ptr(), &len)) { + LOG_ERROR("server_control_sock getsockname"); + return KVS_STATUS_FAILURE; + } + server_args args; args.args = addr; args.sock_listener = server_listen_sock; diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h index 103485e47..d04de2eaa 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs.h @@ -82,8 +82,14 @@ class internal_kvs final : public ikvs_wrapper { kvs_status_t kvs_get_replica_size(size_t& replica_size) override; + internal_kvs() = default; + ~internal_kvs() override; + internal_kvs& operator=(const internal_kvs&) = delete; + + internal_kvs(const internal_kvs&) = delete; + void set_server_address(const std::string& server_addr) { server_address = server_addr; } @@ -106,18 +112,23 @@ class internal_kvs final : public ikvs_wrapper { std::list local_host_ipv6s; char local_host_ip[CCL_IP_LEN]; - size_t main_port; - size_t local_port; + size_t main_port = 0; + size_t local_port = 0; size_t is_master = 0; std::mutex client_memory_mutex; std::shared_ptr main_server_address; std::shared_ptr local_server_address; + static constexpr int INVALID_SOCKET = -1; - int client_op_sock; /* used on client side to send commands and to recv result to/from server */ + int client_op_sock{ + INVALID_SOCKET + }; /* used on client side to send commands and to 
recv result to/from server */ - int client_control_sock; /* used on client side to control local kvs server */ - int server_control_sock; /* used on server side to be controlled by local client */ + int client_control_sock{ INVALID_SOCKET }; /* used on client side to control local kvs server */ + int server_control_sock{ + INVALID_SOCKET + }; /* used on server side to be controlled by local client */ typedef enum ip_getting_type { IGT_ENV = 0, @@ -134,7 +145,9 @@ class internal_kvs final : public ikvs_wrapper { const int CONNECTION_TIMEOUT = 120; - int server_listen_sock; /* used on server side to handle new incoming connect requests from clients */ + int server_listen_sock{ + INVALID_SOCKET + }; /* used on server side to handle new incoming connect requests from clients */ std::string server_address{}; sa_family_t address_family{ AF_UNSPEC }; diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp index ab862188d..aa54bf80e 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.cpp @@ -93,6 +93,7 @@ kvs_status_t server::try_to_connect_new() { } int new_socket; + bool socket_found = false; socklen_t peer_addr_size = addr->size(); if ((new_socket = accept(poll_fds[FDI_LISTENER].fd, addr->get_sock_addr_ptr(), @@ -103,9 +104,23 @@ kvs_status_t server::try_to_connect_new() { for (size_t i = FDI_LAST; i < poll_fds.size(); i++) { if (poll_fds[i].fd == free_socket) { poll_fds[i].fd = new_socket; + socket_found = true; break; } } + if (!socket_found) { + // the code is written in a way that there should always be a free socket available + // if no socket is found, this means that there is an error in the code + // or that an out-of-memory exception occurred while resizing the poll_fds vector + // and it was not properly handled in the layers above internal_kvs_server + LOG_ERROR("free socket not found; this indicates a programmer's error"); + if (close(new_socket)) { + // we are already returning failure; there is not much we can do + // except for logging the exact error that occurred + LOG_ERROR("error closing a socket, %s", strerror(errno)); + } + return KVS_STATUS_FAILURE; + } client_count++; if (poll_fds.size() - FDI_LAST == client_count) { size_t old_size = poll_fds.size(); @@ -338,6 +353,12 @@ kvs_status_t server::check_finalize(size_t& to_finalize) { kvs_status_t server::run(void* args) { size_t should_stop = false; int so_reuse = 1; +#ifdef SO_REUSEPORT + int reuse_optname = SO_REUSEPORT; +#else + int reuse_optname = SO_REUSEADDR; +#endif + poll_fds.resize(client_count_increase); for (auto& it : poll_fds) { it.fd = free_socket; @@ -346,11 +367,11 @@ poll_fds[FDI_LISTENER].fd = ((server_args_t*)args)->sock_listener; address_family = ((server_args_t*)args)->args->sin_family(); -#ifdef SO_REUSEPORT - setsockopt(poll_fds[FDI_LISTENER].fd, SOL_SOCKET, SO_REUSEPORT, &so_reuse, sizeof(so_reuse)); -#else - setsockopt(poll_fds[FDI_LISTENER].fd, SOL_SOCKET, SO_REUSEADDR, &so_reuse, sizeof(so_reuse)); -#endif + if (setsockopt( + poll_fds[FDI_LISTENER].fd, SOL_SOCKET, reuse_optname, &so_reuse, sizeof(so_reuse))) { + LOG_ERROR("server_listen_sock setsockopt(%s)", strerror(errno)); + return KVS_STATUS_FAILURE; + } if (listen(poll_fds[FDI_LISTENER].fd, max_client_queue_size) < 0) { LOG_ERROR("server_listen_sock listen(%s)", strerror(errno)); diff --git
a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp index 7e287b636..e7c8eea43 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/kvs/internal_kvs_server.hpp @@ -14,10 +14,17 @@ limitations under the License. */ #pragma once + +#include + #include "util/pm/pmi_resizable_rt/pmi_resizable/def.h" #include "internal_kvs.h" -enum kvs_access_mode_t : int { +/// kvs_access_mode_t is serialized to a 4-byte buffer, in little-endian order +/// uint32_t is chosen: +/// 1. There is a bijective relation uint32_t -> uint8_t[4] +/// 2. No corner case handling for negative values is required +enum kvs_access_mode_t : uint32_t { AM_CLOSE = 1, AM_PUT = 2, AM_REMOVE = 3, @@ -42,8 +49,20 @@ class kvs_request_t { const std::string& kvs_val = {}) { std::vector put_buf(put_buf_size, 0); size_t step = 0; - std::string put_mode_str = std::to_string(put_mode); - std::copy(put_mode_str.begin(), put_mode_str.end(), put_buf.begin()); + // convert local endianness to little endian + uint8_t put_mode_buff[4]{ + (static_cast((static_cast(put_mode) >> 0) & 0xFF)), + (static_cast((static_cast(put_mode) >> 8) & 0xFF)), + (static_cast((static_cast(put_mode) >> 16) & 0xFF)), + (static_cast((static_cast(put_mode) >> 24) & 0xFF)), + }; + static_assert(sizeof(put_mode_buff) == sizeof(mode), + "`mode` size is no longer compatible with `put_mode_buff`"); + static_assert(sizeof(put_mode_buff) == sizeof(put_mode), + "`put_mode` size is no longer compatible with `put_mode_buff`"); + static_assert(sizeof(put_mode_buff) == sizeof(kvs_access_mode_t), + "`kvs_access_mode_t` size is no longer compatible with `put_mode_buff`"); + std::copy(std::begin(put_mode_buff), std::end(put_mode_buff), put_buf.begin()); if (!kvs_name.empty()) { KVS_ERROR_IF_NOT(kvs_name.length() <= MAX_KVS_NAME_LENGTH); @@ -151,10 +170,15 @@ class kvs_request_t { mode = AM_CLOSE; return KVS_STATUS_SUCCESS; } - int tmp_mode; - safe_strtol(get_buf.data(), tmp_mode); - mode = static_cast(tmp_mode); - auto it_get_buf = get_buf.begin() + sizeof(mode); + auto it_get_buf = get_buf.begin(); + uint8_t temp_mode[4]{ 0 }; + static_assert(sizeof(temp_mode) == sizeof(kvs_access_mode_t), + "`kvs_access_mode_t` size is no longer compatible with `temp_mode`"); + std::copy(it_get_buf, it_get_buf + sizeof(mode), temp_mode); + // convert the little-endian serialized buffer to host endianness + mode = static_cast((temp_mode[0] << 0) | (temp_mode[1] << 8) | + (temp_mode[2] << 16) | (temp_mode[3] << 24)); + it_get_buf += sizeof(mode); std::copy(it_get_buf, it_get_buf + sizeof(name), name); it_get_buf += sizeof(name); std::copy(it_get_buf, it_get_buf + sizeof(key), key); @@ -166,7 +190,7 @@ class kvs_request_t { private: friend class server; - kvs_access_mode_t mode{ AM_PUT }; + kvs_access_mode_t mode{ AM_PUT }; // serialized to little-endian byte order for buffer transfers char name[MAX_KVS_NAME_LENGTH]{}; char key[MAX_KVS_KEY_LENGTH]{}; char val[MAX_KVS_VAL_LENGTH]{}; diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp index e5b79cdc0..365e9a86a 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/pmi_listener.cpp @@ -54,8 +54,7 @@ kvs_status_t pmi_listener::collect_sock_addr(std::shared_ptr h) { LOG_ERROR("Can't get host IP"); return
KVS_STATUS_FAILURE; } - CHECK_FGETS(fgets(my_ip, MAX_KVS_VAL_LENGTH, fp), my_ip); - pclose(fp); + CHECK_FGETS(fgets(my_ip, MAX_KVS_VAL_LENGTH, fp), my_ip, fp); while (my_ip[strlen(my_ip) - 1] == '\n' || my_ip[strlen(my_ip) - 1] == ' ') my_ip[strlen(my_ip) - 1] = '\0'; if ((point_to_space = strstr(my_ip, " ")) != NULL) @@ -135,18 +134,32 @@ kvs_status_t pmi_listener::clean_listener(std::shared_ptr h) { kvs_status_t pmi_listener::send_notification(int sig, std::shared_ptr h) { size_t i; + ssize_t sendto_ret = 0; char message[INT_STR_SIZE]; KVS_CHECK_STATUS(collect_sock_addr(h), "failed to collect sock info"); SET_STR(message, INT_STR_SIZE, "%s", "Update!"); for (i = 0; i < num_listeners; ++i) { - sendto(sock_sender, - message, - INT_STR_SIZE, - MSG_DONTWAIT, - (const struct sockaddr*)&(server_addresses[i]), - sizeof(server_addresses[i])); + sendto_ret = sendto(sock_sender, + message, + INT_STR_SIZE, + MSG_DONTWAIT, + (const struct sockaddr*)&(server_addresses[i]), + sizeof(server_addresses[i])); + if (sendto_ret != INT_STR_SIZE) { + if (sendto_ret == -1) { + LOG_ERROR("sendto error occurred, %s", strerror(errno)); + } + else { + LOG_ERROR( + "notification underflow error occurred, %zd/%zd", sendto_ret, INT_STR_SIZE); + } + if (sig) { + KVS_CHECK_STATUS(clean_listener(h), "failed to clean listener"); + } + return KVS_STATUS_FAILURE; + } } if (sig) { KVS_CHECK_STATUS(clean_listener(h), "failed to clean listener"); @@ -173,8 +186,7 @@ kvs_status_t pmi_listener::run_listener(std::shared_ptr h) { printf("Can't get host IP\n"); exit(1); } - CHECK_FGETS(fgets(my_ip, MAX_KVS_VAL_LENGTH, fp), my_ip); - pclose(fp); + CHECK_FGETS(fgets(my_ip, MAX_KVS_VAL_LENGTH, fp), my_ip, fp); while (my_ip[strlen(my_ip) - 1] == '\n' || my_ip[strlen(my_ip) - 1] == ' ') my_ip[strlen(my_ip) - 1] = '\0'; if ((point_to_space = strstr(my_ip, " ")) != NULL) diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp index 00af8c1ee..69b8b2b5c 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable/resizable_pmi.cpp @@ -73,6 +73,7 @@ kvs_status_t pmi_resizable::PMIR_Update(void) { std::list dead_up_idx{}; std::list list{}; + // this code is a part of undocumented resizable_pmi functionality new_ranks_count = 0; killed_ranks_count = 0; if (finalized == 1) { @@ -115,6 +116,7 @@ // while (int_list_is_contained(killed_ranks, root_rank) == 1) { int old_root = root_rank; + KVS_CHECK_STATUS(h->get_new_root(&root_rank), "failed to new root rank"); if (my_rank == root_rank && old_root != root_rank) @@ -188,6 +190,7 @@ kvs_status_t pmi_resizable::PMIR_Update(void) { } case KVS_RA_FINALIZE: { KVS_CHECK_STATUS(PMIR_Finalize(), "failed to finalize"); + break; } default: { LOG_ERROR("Unknown resize action: %d\n", answer); @@ -205,6 +208,7 @@ kvs_status_t pmi_resizable::PMIR_Update(void) { h->get_shift(list); count_pods = count_pods - killed_ranks_count + new_ranks_count; + KVS_CHECK_STATUS(h->update(list, dead_up_idx, root_rank), "failed to update root"); root_rank = 0; @@ -249,8 +253,7 @@ kvs_status_t pmi_resizable::PMIR_Init(const char* addr) { printf("Can't get hostname\n"); exit(1); } - CHECK_FGETS(fgets(pmi_hostname, MAX_KVS_VAL_LENGTH, fp), pmi_hostname); - pclose(fp); + CHECK_FGETS(fgets(pmi_hostname, MAX_KVS_VAL_LENGTH, fp), pmi_hostname, fp); while (pmi_hostname[strlen(pmi_hostname) - 1] == '\n' ||
pmi_hostname[strlen(pmi_hostname) - 1] == ' ') pmi_hostname[strlen(pmi_hostname) - 1] = '\0'; diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h index d350fa830..111b52706 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple.h @@ -49,6 +49,10 @@ class pmi_resizable_simple final : public ipmi { ~pmi_resizable_simple() override; + pmi_resizable_simple& operator=(const pmi_resizable_simple&) = delete; + + pmi_resizable_simple(const pmi_resizable_simple&) = delete; + int is_pm_resize_enabled() override; atl_status_t pmrt_main_addr_reserve(char* main_addr) override; diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp index 4138297c4..28c308d10 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.cpp @@ -38,7 +38,7 @@ pmi_resizable_simple_internal::pmi_resizable_simple_internal(int size, const char* main_addr) : comm_size(size), ranks(ranks), - k(k), + k(std::move(k)), main_addr(main_addr), max_keylen(MAX_KVS_KEY_LENGTH), max_vallen(MAX_KVS_VAL_LENGTH), @@ -103,15 +103,19 @@ atl_status_t pmi_resizable_simple_internal::registration() { char* proc_count_str = const_cast(val_storage_vec.data()); char* rank_str = strstr(proc_count_str, "_"); + ATL_CHECK_PTR(rank_str, "proc_count_str contains corrupted data"); rank_str[0] = '\0'; rank_str++; char* proc_rank_count_str = strstr(rank_str, "_"); + ATL_CHECK_PTR(proc_rank_count_str, "proc_count_str contains corrupted data"); proc_rank_count_str[0] = '\0'; proc_rank_count_str++; char* threads_count_str = strstr(proc_rank_count_str, "_"); + ATL_CHECK_PTR(threads_count_str, "proc_count_str contains corrupted data"); threads_count_str[0] = '\0'; threads_count_str++; char* thread_num_str = strstr(threads_count_str, "_"); + ATL_CHECK_PTR(thread_num_str, "proc_count_str contains corrupted data"); thread_num_str[0] = '\0'; thread_num_str++; diff --git a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h index 53222ffb8..19ff9e7c1 100644 --- a/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h +++ b/src/atl/util/pm/pmi_resizable_rt/pmi_resizable_simple_internal.h @@ -49,6 +49,10 @@ class pmi_resizable_simple_internal final : public ipmi { ~pmi_resizable_simple_internal() override; + pmi_resizable_simple_internal& operator=(const pmi_resizable_simple_internal&) = delete; + + pmi_resizable_simple_internal(const pmi_resizable_simple_internal&) = delete; + int is_pm_resize_enabled() override; atl_status_t pmrt_main_addr_reserve(char* main_addr) override; diff --git a/src/coll/algorithms/algorithm_utils.cpp b/src/coll/algorithms/algorithm_utils.cpp index e70cad1fb..fdab96d67 100644 --- a/src/coll/algorithms/algorithm_utils.cpp +++ b/src/coll/algorithms/algorithm_utils.cpp @@ -19,8 +19,10 @@ #include "coll/algorithms/algorithm_utils.hpp" #include "common/log/log.hpp" +#include "sched/entry/factory/entry_factory.hpp" const char* ccl_coll_type_to_str(ccl_coll_type type) { + auto type_str = "undefined"; switch (type) { case ccl_coll_allgatherv: return "allgatherv"; case ccl_coll_allreduce: return "allreduce"; @@ -28,13 +30,15 @@ const char* ccl_coll_type_to_str(ccl_coll_type type) { case ccl_coll_alltoallv: return "alltoallv"; case 
ccl_coll_barrier: return "barrier"; case ccl_coll_bcast: return "bcast"; + case ccl_coll_recv: return "recv"; case ccl_coll_reduce: return "reduce"; case ccl_coll_reduce_scatter: return "reduce_scatter"; + case ccl_coll_send: return "send"; case ccl_coll_partial: return "partial"; - case ccl_coll_undefined: return "undefined"; - default: return "unknown"; + case ccl_coll_undefined: return type_str; + default: type_str = "unknown"; } - return "unknown"; + return type_str; } void ccl_get_segment_sizes(size_t dtype_size, @@ -59,7 +63,8 @@ void ccl_get_segment_sizes(size_t dtype_size, seg_sizes.resize(total_seg_count, regular_seg_size); std::fill(seg_sizes.begin() + regular_seg_count, seg_sizes.end(), large_seg_size); - size_t sum = std::accumulate(seg_sizes.begin(), seg_sizes.end(), 0); + size_t sum = + std::accumulate(seg_sizes.begin(), seg_sizes.end(), ccl::utils::initial_count_value); if (sum != elem_count) { std::stringstream ss; for (size_t idx = 0; idx < seg_sizes.size(); idx++) { @@ -83,3 +88,52 @@ void ccl_get_segment_sizes(size_t dtype_size, } } } + +#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) + +uint32_t submit_ze_commands_in_subsched_entries(ccl_sched* sched) { + std::vector subsched_chunks; + for (auto& entry : sched->entries) { + if (!strncmp(entry->name(), "ALLREDUCE_PIPE", strlen("ALLREDUCE_PIPE"))) { + subsched_chunks.push_back(static_cast(entry.get())); + } + } + + auto chunk_count = subsched_chunks.size(); + LOG_DEBUG("chunk_count ", chunk_count); + + std::vector next_entry(chunk_count, 0); + bool done = false; + uint32_t command_count = 0; + int cmd_idx = 0; + while (!done) { + done = true; + for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { + LOG_DEBUG("cmd_idx=", + cmd_idx, + ", chunk_idx=", + chunk_idx, + ", | ", + subsched_chunks[chunk_idx]->name(), + ", entries.size=", + subsched_chunks[chunk_idx]->get_subsched()->entries.size(), + ", next_entry=", + next_entry[chunk_idx]); + if (next_entry[chunk_idx] < + subsched_chunks[chunk_idx]->get_subsched()->entries.size()) { + LOG_DEBUG("cmd_idx=", cmd_idx, ", chunk_idx=", chunk_idx, ", submitting commands"); + command_count += subsched_chunks[chunk_idx] + ->get_subsched() + ->entries[next_entry[chunk_idx]++] + ->ze_commands_submit(); + done = false; + } + LOG_DEBUG("cmd_idx=", cmd_idx, ", chunk_idx=", chunk_idx, ", done=", done); + } + ++cmd_idx; + } + + return command_count; +} + +#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL diff --git a/src/coll/algorithms/algorithm_utils.hpp b/src/coll/algorithms/algorithm_utils.hpp index 47c88ed9e..86c522a0b 100644 --- a/src/coll/algorithms/algorithm_utils.hpp +++ b/src/coll/algorithms/algorithm_utils.hpp @@ -22,7 +22,8 @@ #define CCL_COLL_LIST \ ccl_coll_allgatherv, ccl_coll_allreduce, ccl_coll_alltoall, ccl_coll_alltoallv, \ - ccl_coll_barrier, ccl_coll_bcast, ccl_coll_reduce, ccl_coll_reduce_scatter + ccl_coll_barrier, ccl_coll_bcast, ccl_coll_recv, ccl_coll_reduce, ccl_coll_reduce_scatter, \ + ccl_coll_send enum ccl_coll_allgatherv_algo { ccl_coll_allgatherv_undefined = 0, @@ -84,6 +85,14 @@ enum ccl_coll_bcast_algo { ccl_coll_bcast_topo }; +enum ccl_coll_recv_algo { + ccl_coll_recv_undefined = 0, + + ccl_coll_recv_direct, + ccl_coll_recv_offload, + ccl_coll_recv_topo +}; + enum ccl_coll_reduce_algo { ccl_coll_reduce_undefined = 0, @@ -103,6 +112,14 @@ enum ccl_coll_reduce_scatter_algo { ccl_coll_reduce_scatter_topo }; +enum ccl_coll_send_algo { + ccl_coll_send_undefined = 0, + + ccl_coll_send_direct, + ccl_coll_send_offload, + ccl_coll_send_topo +}; + union 
ccl_coll_algo { ccl_coll_allgatherv_algo allgatherv; ccl_coll_allreduce_algo allreduce; @@ -110,8 +127,10 @@ union ccl_coll_algo { ccl_coll_alltoallv_algo alltoallv; ccl_coll_barrier_algo barrier; ccl_coll_bcast_algo bcast; + ccl_coll_recv_algo recv; ccl_coll_reduce_algo reduce; ccl_coll_reduce_scatter_algo reduce_scatter; + ccl_coll_send_algo send; int value; ccl_coll_algo() : value(0) {} @@ -127,9 +146,9 @@ enum ccl_coll_type { ccl_coll_alltoallv, ccl_coll_barrier, ccl_coll_bcast, + ccl_coll_recv, ccl_coll_reduce, ccl_coll_reduce_scatter, - ccl_coll_recv, ccl_coll_send, ccl_coll_last_regular = ccl_coll_send, @@ -145,3 +164,9 @@ void ccl_get_segment_sizes(size_t dtype_size, size_t elem_count, size_t requested_seg_size, std::vector& seg_sizes); + +class ccl_sched; + +#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) +uint32_t submit_ze_commands_in_subsched_entries(ccl_sched* sched); +#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL diff --git a/src/coll/algorithms/algorithms.hpp b/src/coll/algorithms/algorithms.hpp index fff8b9431..6e90a12d0 100644 --- a/src/coll/algorithms/algorithms.hpp +++ b/src/coll/algorithms/algorithms.hpp @@ -178,6 +178,23 @@ ccl::status ccl_coll_build_topo_bcast(ccl_sched* sched, ccl_comm* comm); #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE +// recv +ccl::status ccl_coll_build_direct_recv(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int peer_rank, + ccl_comm* comm); + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +ccl::status ccl_coll_build_topo_recv(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int peer_rank, + ccl_comm* comm); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + // reduce ccl::status ccl_coll_build_direct_reduce(ccl_sched* sched, ccl_buffer send_buf, @@ -254,6 +271,23 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, ccl_comm* comm); #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE +// send +ccl::status ccl_coll_build_direct_send(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int peer_rank, + ccl_comm* comm); + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +ccl::status ccl_coll_build_topo_send(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int peer_rank, + ccl_comm* comm); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + class ccl_double_tree; ccl::status ccl_coll_build_double_tree_op(ccl_sched* sched, ccl_coll_type coll_type, diff --git a/src/coll/algorithms/allgatherv.cpp b/src/coll/algorithms/allgatherv.cpp index 97a8d07ce..428a2e589 100644 --- a/src/coll/algorithms/allgatherv.cpp +++ b/src/coll/algorithms/allgatherv.cpp @@ -45,6 +45,11 @@ ccl::status ccl_coll_build_naive_allgatherv(ccl_sched* sched, const ccl_datatype& dtype, ccl_comm* comm) { LOG_DEBUG("build naive allgatherv"); + CCL_THROW_IF_NOT(recv_counts[comm->rank()] == send_count, + "unexpected send count: ", + send_count, + " vs ", + recv_counts[comm->rank()]); ccl::status status = ccl::status::success; @@ -58,7 +63,7 @@ ccl::status ccl_coll_build_naive_allgatherv(ccl_sched* sched, offsets[rank] = offsets[rank - 1] + recv_counts[rank - 1] * dtype_size; } - if (send_buf != recv_buf) { + if ((send_buf != recv_buf) && (send_count > 0)) { // out-of-place case entry_factory::create( sched, send_buf, recv_buf + offsets[comm_rank], send_count, dtype); @@ -68,13 +73,17 @@ ccl::status ccl_coll_build_naive_allgatherv(ccl_sched* sched, int dst = (comm_rank + idx) % comm_size; int src = (comm_rank - idx + comm_size) % comm_size; - 
// send own buffer to other ranks - entry_factory::create( - sched, recv_buf + offsets[comm_rank], send_count, dtype, dst, comm); + if (send_count > 0) { + // send own buffer to other ranks + entry_factory::create( + sched, recv_buf + offsets[comm_rank], send_count, dtype, dst, comm); + } - // recv other's rank buffer - entry_factory::create( - sched, recv_buf + offsets[src], recv_counts[src], dtype, src, comm); + if (recv_counts[src] > 0) { + // recv the other rank's buffer + entry_factory::create( + sched, recv_buf + offsets[src], recv_counts[src], dtype, src, comm); + } } return status; @@ -91,6 +100,11 @@ ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* main_sched, LOG_DEBUG("build ring allgatherv, send_count ", send_count); CCL_THROW_IF_NOT(main_sched || (!main_sched && scheds.size() == 1), "unexpected scheduler/sub-schedulers combination"); + CCL_THROW_IF_NOT(recv_counts[comm->rank()] == send_count, + "unexpected send count: ", + send_count, + " vs ", + recv_counts[comm->rank()]); int rank = comm->rank(); int comm_size = comm->size(); @@ -103,7 +117,7 @@ offsets[rank_idx] = offsets[rank_idx - 1] + recv_counts[rank_idx - 1] * dtype_size; } - if (send_buf != recv_buf) { + if ((send_buf != recv_buf) && (send_count > 0)) { // initialize recv_buffer with initial send_buf value // scheds.front contains either main scheduler or first sub-scheduler entry_factory::create( @@ -151,18 +165,22 @@ ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* main_sched, } for (size_t s_idx = 0; s_idx < sched_count; s_idx++) { - entry_factory::create(scheds[s_idx], - sbuf + send_sched_offset[s_idx], - send_block_thread_counts[s_idx], - dtype, - dst, - comm); - entry_factory::create(scheds[s_idx], - rbuf + recv_sched_offset[s_idx], - recv_block_thread_counts[s_idx], - dtype, - src, - comm); + if (send_block_thread_counts[s_idx]) { + entry_factory::create(scheds[s_idx], + sbuf + send_sched_offset[s_idx], + send_block_thread_counts[s_idx], + dtype, + dst, + comm); + } + if (recv_block_thread_counts[s_idx]) { + entry_factory::create(scheds[s_idx], + rbuf + recv_sched_offset[s_idx], + recv_block_thread_counts[s_idx], + dtype, + src, + comm); + } // recv_entry has to be completed before the send_entry operation // in the next loop iteration, we are sending the received data-block forward // following the ring algorithm. Therefore, barrier is needed.
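The zero-count guards above are easiest to verify in isolation. Below is a minimal, self-contained C++ sketch of the naive allgatherv exchange with the same guards; post_send/post_recv are hypothetical stand-ins for the send_entry/recv_entry factory calls, and the offsets mirror the prefix-sum from the hunk above (with dtype_size folded in):

#include <cstdio>
#include <vector>

// hypothetical stand-ins for entry_factory::create<send_entry>/<recv_entry>
static void post_send(size_t offset, size_t count, int dst) {
    std::printf("send %zu elems at offset %zu -> rank %d\n", count, offset, dst);
}
static void post_recv(size_t offset, size_t count, int src) {
    std::printf("recv %zu elems at offset %zu <- rank %d\n", count, offset, src);
}

int main() {
    const int comm_rank = 1, comm_size = 4;
    const std::vector<size_t> recv_counts{ 4, 0, 2, 3 }; // rank 1 contributes nothing

    // prefix-sum offsets, as computed before the exchange loop
    std::vector<size_t> offsets(comm_size, 0);
    for (int r = 1; r < comm_size; ++r)
        offsets[r] = offsets[r - 1] + recv_counts[r - 1];

    for (int idx = 1; idx < comm_size; ++idx) {
        const int dst = (comm_rank + idx) % comm_size;
        const int src = (comm_rank - idx + comm_size) % comm_size;
        // the guards added in the diff: zero-sized blocks schedule no entries
        if (recv_counts[comm_rank] > 0)
            post_send(offsets[comm_rank], recv_counts[comm_rank], dst);
        if (recv_counts[src] > 0)
            post_recv(offsets[src], recv_counts[src], src);
    }
    return 0;
}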
@@ -179,14 +197,12 @@ ccl::status ccl_coll_build_ring_allgatherv(ccl_sched* main_sched, return ccl::status::success; } -ccl::status ccl_coll_get_allgatherv_bufs_and_offsets(const ccl_coll_param& coll_param, - std::vector& recv_bufs, - std::vector& recv_offsets) { +ccl::status ccl_coll_get_allgatherv_bufs(const ccl_coll_param& coll_param, + std::vector& recv_bufs) { int comm_size = coll_param.comm->size(); size_t dtype_size = coll_param.dtype.size(); recv_bufs.resize(comm_size); - recv_offsets.resize(comm_size); if (coll_param.recv_bufs.size() > 1) { CCL_THROW_IF_NOT((int)coll_param.recv_bufs.size() == comm_size, @@ -198,7 +214,6 @@ ccl::status ccl_coll_get_allgatherv_bufs_and_offsets(const ccl_coll_param& coll_ for (int idx = 0; idx < comm_size; idx++) { recv_bufs[idx].set(coll_param.get_recv_buf(idx), coll_param.get_recv_count(idx) * dtype_size); - recv_offsets[idx] = 0; } } else { @@ -206,7 +221,6 @@ ccl::status ccl_coll_get_allgatherv_bufs_and_offsets(const ccl_coll_param& coll_ for (int idx = 0; idx < comm_size; idx++) { size_t bytes = coll_param.get_recv_count(idx) * dtype_size; recv_bufs[idx].set(coll_param.get_recv_buf(), offset + bytes, offset); - recv_offsets[idx] = offset; offset += bytes; } } @@ -231,24 +245,27 @@ ccl::status ccl_coll_build_flat_allgatherv(ccl_sched* main_sched, bool inplace = coll_param.is_inplace(); std::vector recv_bufs; - std::vector recv_offsets; - ccl_coll_get_allgatherv_bufs_and_offsets(coll_param, recv_bufs, recv_offsets); + ccl_coll_get_allgatherv_bufs(coll_param, recv_bufs); auto send_seg = ccl_buffer(coll_param.get_send_buf(), coll_param.get_send_count() * dtype_size); if (!inplace) { - entry_factory::create( - scheds[2 * comm_rank % sched_count], - ccl_buffer(coll_param.get_send_buf(), coll_param.get_send_count() * dtype_size), - recv_bufs[comm_rank], - coll_param.get_recv_count(comm_rank), - dtype); + if (coll_param.get_recv_count(comm_rank)) { + entry_factory::create( + scheds[2 * comm_rank % sched_count], + ccl_buffer(coll_param.get_send_buf(), coll_param.get_send_count() * dtype_size), + recv_bufs[comm_rank], + coll_param.get_recv_count(comm_rank), + dtype); + } } else { - size_t total_recv_bytes = - std::accumulate(coll_param.recv_counts.begin(), coll_param.recv_counts.end(), 0) * - dtype_size; - send_seg = ccl_buffer(coll_param.get_send_buf(), total_recv_bytes, recv_offsets[comm_rank]); + size_t total_recv_bytes = std::accumulate(coll_param.recv_counts.begin(), + coll_param.recv_counts.end(), + ccl::utils::initial_count_value) * + dtype_size; + send_seg = ccl_buffer( + coll_param.get_send_buf(), total_recv_bytes, recv_bufs[comm_rank].get_offset()); } CCL_THROW_IF_NOT(static_cast(sched_count) == comm_size || !main_sched, @@ -262,19 +279,23 @@ ccl::status ccl_coll_build_flat_allgatherv(ccl_sched* main_sched, if (static_cast(idx) == comm_rank) continue; - entry_factory::create(scheds[(comm_rank + idx) % sched_count], - recv_bufs[idx], - coll_param.get_recv_count(idx), - dtype, - idx, - comm); - - entry_factory::create(scheds[(comm_rank + idx) % sched_count], - send_seg, - coll_param.get_recv_count(comm_rank), - dtype, - idx, - comm); + if (coll_param.get_recv_count(idx)) { + entry_factory::create(scheds[(comm_rank + idx) % sched_count], + recv_bufs[idx], + coll_param.get_recv_count(idx), + dtype, + idx, + comm); + } + + if (coll_param.get_recv_count(comm_rank)) { + entry_factory::create(scheds[(comm_rank + idx) % sched_count], + send_seg, + coll_param.get_recv_count(comm_rank), + dtype, + idx, + comm); + } } if (main_sched) { 
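// Why the std::accumulate initial value changed from a literal 0 to
// ccl::utils::initial_count_value in the hunks above: std::accumulate deduces
// its accumulator type from the initial value, so an int 0 forces the whole
// sum into int and can overflow for large buffers. A minimal sketch of the
// pitfall, assuming initial_count_value is simply a size_t zero (an
// assumption; only its use as an init argument is visible in this patch):
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    // two blocks of 3 * 2^30 elements each; their sum exceeds INT_MAX
    std::vector<size_t> recv_counts{ size_t{ 3 } << 30, size_t{ 3 } << 30 };

    // with a literal 0 the accumulator is int and the sum overflows:
    // int bad = std::accumulate(recv_counts.begin(), recv_counts.end(), 0);

    // with a size_t initial value the accumulation stays in size_t:
    size_t total = std::accumulate(recv_counts.begin(), recv_counts.end(), size_t{ 0 });
    std::printf("total = %zu\n", total);
    return 0;
}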
main_sched->sync_subscheds(); @@ -303,8 +324,7 @@ ccl::status ccl_coll_build_multi_bcast_allgatherv(ccl_sched* main_sched, bool inplace = coll_param.is_inplace(); std::vector recv_bufs; - std::vector recv_offsets; - ccl_coll_get_allgatherv_bufs_and_offsets(coll_param, recv_bufs, recv_offsets); + ccl_coll_get_allgatherv_bufs(coll_param, recv_bufs); if (!inplace) { std::vector copy_counts(data_partition_count); @@ -319,14 +339,17 @@ ccl::status ccl_coll_build_multi_bcast_allgatherv(ccl_sched* main_sched, CCL_ASSERT(scheds.size() >= data_partition_count); for (size_t idx = 0; idx < data_partition_count; idx++) { - entry_factory::create(scheds[idx], - ccl_buffer(coll_param.get_send_buf_ptr(), - coll_param.get_send_count() * dtype_size, - copy_offsets[idx], - ccl_buffer_type::INDIRECT), - recv_bufs[comm_rank] + copy_offsets[idx], - copy_counts[idx], - dtype); + if (copy_counts[idx]) { + ccl_buffer cbuf(coll_param.get_send_buf_ptr(), + coll_param.get_send_count() * dtype_size, + copy_offsets[idx], + ccl_buffer_type::INDIRECT); + entry_factory::create(scheds[idx], + cbuf, + recv_bufs[comm_rank] + copy_offsets[idx], + copy_counts[idx], + dtype); + } } if (main_sched) { main_sched->sync_subscheds(); @@ -334,15 +357,17 @@ ccl::status ccl_coll_build_multi_bcast_allgatherv(ccl_sched* main_sched, } for (int idx = 0; idx < comm_size; idx++) { - ccl_coll_entry_param param{}; - param.ctype = ccl_coll_bcast; - param.recv_buf = recv_bufs[idx]; - param.count = coll_param.get_recv_count(idx); - param.dtype = dtype; - param.root = idx; - param.comm = comm; - param.stream = coll_param.stream; - ccl::add_coll_entry(scheds[idx % sched_count], param); + if (coll_param.get_recv_count(idx)) { + ccl_coll_param param{ false }; + param.ctype = ccl_coll_bcast; + param.recv_buf = recv_bufs[idx]; + param.count = coll_param.get_recv_count(idx); + param.dtype = dtype; + param.root = idx; + param.comm = comm; + param.stream = coll_param.stream; + ccl::add_coll_entry(scheds[idx % sched_count], param); + } } return ccl::status::success; @@ -350,31 +375,14 @@ ccl::status ccl_coll_build_multi_bcast_allgatherv(ccl_sched* main_sched, #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) -ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, - std::vector& scheds, - const ccl_coll_param& coll_param) { - LOG_DEBUG("build topo allgatherv"); - CCL_THROW_IF_NOT(scheds.size() == 1); - - ccl_comm* comm = coll_param.comm; - ccl_sched* sched = scheds.front(); - const ccl_datatype& dtype = coll_param.dtype; - const bool is_inplace = coll_param.is_inplace(); - - std::vector recv_bufs; - std::vector recv_offsets; - const std::vector& recv_counts = coll_param.recv_counts; - ccl_coll_get_allgatherv_bufs_and_offsets(coll_param, recv_bufs, recv_offsets); - - const size_t send_count = recv_counts[comm->rank()]; - ccl_buffer send_buf; - if (is_inplace) { - send_buf = recv_bufs[comm->rank()]; - } - else { - send_buf = ccl_buffer(coll_param.get_send_buf(), send_count * dtype.size()); - } - +ccl::status ccl_coll_build_topo_allgatherv_fill(ccl_sched* sched, + const ccl_buffer send_buf, + const size_t send_count, + const std::vector& recv_bufs, + const std::vector& recv_counts, + const ccl_datatype& dtype, + ccl_comm* comm, + bool is_inplace) { ccl_comm* pair_comm = comm->get_pair_comm().get(); ccl_comm* even_comm = comm->get_even_comm().get(); ccl_comm* node_comm = comm->get_node_comm().get(); @@ -419,8 +427,35 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, sched->add_barrier(); }; - if (!is_single_node) { - if 
(!is_inplace) { + const bool is_pipeline_kernel = + ccl::global_data::env().allgatherv_monolithic_pipeline_kernel && even_comm->size() > 1; + if (!is_single_node || is_pipeline_kernel) { + if (!is_single_node) { + // pack data to be used for scaleout + std::vector recv_bufs_r2r; + std::vector recv_counts_r2r; + for (int i = 0; i < r2r_comm->size(); i++) { + const int global_rank = r2r_comm->get_global_rank(i); + recv_bufs_r2r.push_back(recv_bufs[global_rank]); + recv_counts_r2r.push_back(recv_counts[global_rank]); + } + + ccl_coll_param coll_param_scaleout{ false }; + coll_param_scaleout.ctype = ccl_coll_allgatherv; + coll_param_scaleout.send_buf = send_buf; + coll_param_scaleout.recv_scale_out_bufs = recv_bufs_r2r; + coll_param_scaleout.send_count = send_count; + coll_param_scaleout.recv_counts = recv_counts_r2r; + coll_param_scaleout.dtype = dtype; + coll_param_scaleout.comm = r2r_comm; + + ccl::add_scaleout(sched, coll_param_scaleout, is_single_node, wait_events, out_event); + CCL_THROW_IF_NOT(out_event, + "scaleout must be added to schedule, but it has not been added"); + } + + // local copy runs in parallel with scaleout + if (recv_counts[comm->rank()] && !is_inplace) { // copy data from my send_buf to my recv_buf copy_attr attr{}; attr.direction = copy_direction::d2d; @@ -435,50 +470,38 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, parallel_copy_events.push_back(entry->entry_event); } - // pack data to be used for scaleout - std::vector recv_bufs_r2r; - std::vector recv_counts_r2r; - for (int i = 0; i < r2r_comm->size(); i++) { - const int global_rank = r2r_comm->get_global_rank(i); - recv_bufs_r2r.push_back(recv_bufs[global_rank]); - recv_counts_r2r.push_back(recv_counts[global_rank]); + // make sure scaleout of even comm ranks is finished + // so that we can start to read from them + if (!is_single_node) { + clear_and_push_back(wait_events, out_event); + ccl::add_comm_barrier(sched, even_comm, wait_events, out_event); + clear_and_push_back(wait_events, out_event); } - ccl_coll_entry_param coll_param_scaleout{}; - coll_param_scaleout.ctype = ccl_coll_allgatherv; - coll_param_scaleout.send_buf = send_buf; - coll_param_scaleout.recv_bufs = recv_bufs_r2r; - coll_param_scaleout.send_count = send_count; - coll_param_scaleout.recv_counts = recv_counts_r2r.data(); - coll_param_scaleout.dtype = dtype; - coll_param_scaleout.comm = r2r_comm; - - ccl::add_scaleout(sched, coll_param_scaleout, is_single_node, wait_events, out_event); - CCL_THROW_IF_NOT(out_event, - "scaleout must be added to schedule, but it has not been added"); - clear_and_push_back(wait_events, out_event); - - ccl::add_comm_barrier(sched, even_comm, wait_events, out_event); - clear_and_push_back(wait_events, out_event); + size_t hint_index = 0; auto recv_send_peers = [&](ccl_comm* recv_comm, ccl_comm* send_comm, - size_t scaleout_offset = 0, - bool is_inplace = false) { + bool is_scaleout = false, + size_t scaleout_offset = 0) { for (int peer_idx = 1; peer_idx < recv_comm->size(); peer_idx++) { // copy data from all peers in even_comm const int peer_rank = (recv_comm->rank() + peer_idx) % recv_comm->size(); CCL_THROW_IF_NOT(recv_comm->rank() != peer_rank, "Do not copy from own rank"); const int global_rank = (recv_comm->get_global_rank(peer_rank) + scaleout_offset) % comm->size(); + if (recv_counts[global_rank] == 0) { + continue; + } + copy_attr attr{}; attr.peer_rank = peer_rank; - if (is_inplace) + if (is_scaleout) attr.peer_buf_idx = recv_buf_idx_start + global_rank; else attr.peer_buf_idx = 
send_buf_idx; attr.direction = copy_direction::c2c; attr.map_comm = recv_comm; - attr.hint_queue_index = parallel_copy_events.size(); + attr.hint_queue_index = hint_index++; auto entry = entry_factory::create(sched, ccl_buffer(), recv_bufs[global_rank], @@ -486,10 +509,10 @@ dtype, attr, wait_events); - parallel_copy_events.push_back(entry->entry_event); // do not do mdfi copy if only one tile is used if (send_comm->size() == 1) { + parallel_copy_events.push_back(entry->entry_event); continue; } @@ -513,31 +536,59 @@ }; size_t node_offset = 0; - // in case of scaleout, data is already copied to recv_buf and we can use in_place - bool is_use_inplace = true; - // TODO: if we ever allow this branch of the algo to run in is_single_node mode, this is the condition required here: "is_inplace || !is_single_node" - for (int r2r_rank = 0; r2r_rank < r2r_comm->size(); r2r_rank++) { + for (int r2r_idx = 0; r2r_idx < r2r_comm->size(); r2r_idx++) { + const int r2r_rank = (r2r_comm->rank() + r2r_idx) % r2r_comm->size(); + + // in case of scaleout, data from other ranks is + // already available in recv_buf and we can use in_place, + // but our own copy from send_buf to recv_buf is still in progress + const bool is_scaleout = r2r_rank != r2r_comm->rank(); + // copy data from even_comm peers (xelink) that they recieved during scaleout // and write the copied data to pair_comm peer (mdfi) - recv_send_peers(even_comm, pair_comm, node_offset, is_use_inplace); - node_offset += node_comm->size(); - - // do not do mdfi copy if only one tile is used - if (pair_comm->size() == 1) { - continue; + if (is_pipeline_kernel) { + // pipelined kernel that copies from even_comm peers using xelink and to pair_comm peer using MDFI + auto entry = entry_factory::create( + sched, + send_buf, + send_count, + recv_bufs, + recv_counts, + dtype, + even_comm, + wait_events, + send_buf_idx, + 0 /* pair_comm_offset */, + is_pipeline_kernel, + pair_comm, + true /* is_separate_block_handles */, + is_scaleout, + node_offset); + parallel_copy_events.push_back(entry->entry_event); } + else { + recv_send_peers(even_comm, pair_comm, is_scaleout, node_offset); + } + node_offset += node_comm->size(); - // write the data recieved during scaleout to pair_comm peer (mdfi) - int send_rank = (pair_comm->rank() + 1) % pair_comm->size(); + // write the data received during scaleout or own data to pair_comm peer (mdfi) + const int send_rank = (pair_comm->rank() + 1) % pair_comm->size(); copy_attr attr_send{}; attr_send.peer_rank = send_rank; const int global_rank = r2r_comm->get_global_rank(r2r_rank); + + // no mdfi copy if data count is zero or only one tile is used + if (recv_counts[global_rank] == 0 || pair_comm->size() == 1) { + continue; + } + attr_send.peer_buf_idx = recv_buf_idx_start + global_rank; attr_send.direction = copy_direction::t2t; attr_send.map_comm = pair_comm; + ccl_buffer in_buf = is_scaleout ?
recv_bufs[global_rank] : send_buf; auto entry_send = entry_factory::create(sched, - recv_bufs[global_rank], + in_buf, ccl_buffer(), recv_counts[global_rank], dtype, @@ -545,63 +596,16 @@ wait_events); parallel_copy_events.push_back(entry_send->entry_event); + + // performance degrades when we add a lot of copies in parallel, + // and splitting the copies into stages seems to improve performance + if (!is_pipeline_kernel && pair_comm->size() > 1) { + add_sched_barrier_for_parallel_copies(); + } } add_sched_barrier_for_parallel_copies(); ccl::add_comm_barrier(sched, pair_comm, wait_events, out_event); - entry_factory::create(sched); - - return ccl::status::success; - } - else if (ccl::global_data::env().allgatherv_monolithic_pipeline_kernel && - even_comm->size() > 1) { - // pipeline is not possible when there is no xelink neighbor in even_comm - - // copy local data to pair_comm neighbor - if (pair_comm->size() > 1) { - // A pair_comm barrier avoids writing data to the peer before the peer starts the collective - ccl::add_comm_barrier(sched, pair_comm, wait_events, out_event); - clear_and_push_back(wait_events, out_event); - - size_t peer_pair_rank = (pair_comm->rank() + 1) % pair_comm->size(); - copy_attr attr{}; - attr.peer_rank = peer_pair_rank; - attr.peer_buf_idx = recv_buf_idx_start + comm->rank(); - attr.direction = copy_direction::t2t; - attr.map_comm = pair_comm; - // copy send buffer to the pair_comm tile recv buffer - auto entry_pair = entry_factory::create( - sched, send_buf, ccl_buffer(), recv_counts[comm->rank()], dtype, attr, wait_events); - clear_and_push_back(wait_events, entry_pair->entry_event); - } - - // Need to set an even_comm barrier to avoid accessing peer's buffers before they start the collective - // TODO: understand a2a_allgatherv_entry so we can maybe reduce even_comm to a smaller comm? - ccl::add_comm_barrier(sched, even_comm, wait_events, out_event); - clear_and_push_back(wait_events, out_event); - - // pipelined kernel that copies from even_comm peers using xelink and to pair_comm peer using MDFI - auto entry = - entry_factory::create(sched, - send_buf, - send_count, - recv_bufs, - recv_counts, - dtype, - even_comm, - wait_events, - send_buf_idx, - 0 /* pair_comm_offset */, - true /* is_monolithic_pipeline */, - pair_comm); - - clear_and_push_back(wait_events, entry->entry_event); - sched->add_barrier(); - // TODO: understand barrier logic and see whether a smaller communicator can be used - ccl::add_comm_barrier(sched, node_comm, wait_events, out_event); - - entry_factory::create(sched); - return ccl::status::success; } @@ -618,6 +622,10 @@ //ccl::global_data::env().ze_max_copy_queues /* here must be real queue count */ >= even_comm_size-1) or unspecified; auto send_to_peers = [&](ccl_comm* comm, ccl_buffer in_buf, size_t count, size_t peer_buf_idx) { + if (count == 0) { + return; + } + for (int peer_idx = 0; peer_idx < comm->size() - 1; peer_idx++) { const int peer_rank = (comm->rank() + peer_idx + 1) % comm->size(); CCL_THROW_IF_NOT(comm->rank() != peer_rank); @@ -628,7 +636,14 @@ const bool use_c2c_direction = (comm == even_comm) || can_use_small_msg_optimization; attr.direction = (use_c2c_direction) ?
copy_direction::c2c : copy_direction::d2d; attr.map_comm = comm; - attr.hint_queue_index = (peer_idx + 1) * 2; + + auto copy_engine_idx = (peer_idx + 1) * 2; + if (ccl::global_data::env().type2_mode == ccl::type2_tune_mode::detected || + ccl::global_data::env().type2_mode == ccl::type2_tune_mode::on) { + copy_engine_idx = peer_idx * 2; + } + + attr.hint_queue_index = copy_engine_idx; auto entry = entry_factory::create( sched, in_buf, ccl_buffer(), count, dtype, attr, wait_events); parallel_copy_events.push_back(entry->entry_event); @@ -640,12 +655,17 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, const int peer_rank = (comm->rank() + peer_idx + 1) % comm->size(); CCL_THROW_IF_NOT(comm->rank() != peer_rank); const int global_rank = comm->get_global_rank(peer_rank); + if (recv_counts[global_rank] == 0) { + continue; + } + copy_attr attr{}; attr.peer_rank = peer_rank; attr.peer_buf_idx = send_buf_idx; const bool use_c2c_direction = (comm == even_comm) || can_use_small_msg_optimization; attr.direction = (use_c2c_direction) ? copy_direction::c2c : copy_direction::d2d; attr.map_comm = comm; + // the perf is worse with peer_idx * 2 on smc attr.hint_queue_index = (peer_idx + 1) * 2; auto entry = entry_factory::create(sched, ccl_buffer(), @@ -658,7 +678,7 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, } }; - const bool do_self_copy = !is_inplace; + const bool do_self_copy = !is_inplace && (recv_counts[comm->rank()] > 0); if (do_self_copy) { /* copy data from my send_buf to my recv_buf */ copy_attr attr{}; @@ -730,10 +750,6 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, /* Large scale algorithm: step 1 & 2. intra-card copy */ LOG_DEBUG("topo/scale_up/intra: copy to self from peers"); - // Need to make sure other peers in pair_comm have entered the collective - ccl::add_comm_barrier(sched, pair_comm, wait_events, out_event); - clear_and_push_back(wait_events, out_event); - if (!is_lead_rank && !ccl::global_data::env().enable_ze_bidir_algo) { ccl::add_comm_barrier(sched, pair_comm, wait_events, out_event); clear_and_push_back(wait_events, out_event); @@ -763,7 +779,112 @@ ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, ccl_comm* barrier_comm = (is_large_scale_algorithm) ? 
even_comm : pair_comm; ccl::add_comm_barrier(sched, barrier_comm, wait_events, out_event); - entry_factory::create(sched); + return ccl::status::success; +} + +ccl::status ccl_coll_build_topo_allgatherv(ccl_sched* main_sched, + std::vector& scheds, + const ccl_coll_param& coll_param) { + size_t chunk_count = ccl::global_data::env().allgatherv_pipe_chunk_count; + bool is_pipe = chunk_count > 0 && ccl::global_data::env().enable_ze_single_list; + + CCL_THROW_IF_NOT( + scheds.size() == 1, "size of schedule list must be one, but is ", scheds.size()); + ccl_sched* sched = scheds.front(); + + ccl_comm* comm = coll_param.comm; + const bool is_inplace = coll_param.is_inplace(); + const ccl_datatype& dtype = coll_param.dtype; + + std::vector recv_bufs{}; + const std::vector& recv_counts = coll_param.recv_counts; + ccl_coll_get_allgatherv_bufs(coll_param, recv_bufs); + + const size_t send_count = recv_counts[comm->rank()]; + ccl_buffer send_buf; + if (is_inplace) { + send_buf = recv_bufs[comm->rank()]; + } + else { + send_buf = ccl_buffer(coll_param.get_send_buf(), send_count * dtype.size()); + } + + if (!is_pipe) { + // Fall back to topo algorithm without pipelining + LOG_DEBUG("build topo allgatherv - pipe allgatherv disabled"); + + ccl_coll_build_topo_allgatherv_fill( + sched, send_buf, send_count, recv_bufs, recv_counts, dtype, comm, is_inplace); + + entry_factory::create(sched); + + return ccl::status::success; + } + + LOG_DEBUG("build pipe allgatherv"); + + for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { + entry_factory::create( + sched, + chunk_idx, + [comm, + is_inplace, + dtype, + send_buf, + send_count, + recv_bufs, + recv_counts, + chunk_idx, + chunk_count](ccl_sched* s) { + bool is_empty_total_size = true; + std::vector chunked_recv_bufs(comm->size()); + std::vector chunked_recv_counts(comm->size(), 0); + + for (int idx = 0; idx < comm->size(); idx++) { + size_t main_chunk_count = recv_counts[idx] / chunk_count; + size_t last_chunk_count = main_chunk_count + recv_counts[idx] % chunk_count; + chunked_recv_counts[idx] = + (chunk_idx == (chunk_count - 1)) ? last_chunk_count : main_chunk_count; + chunked_recv_bufs[idx] = + recv_bufs[idx] + chunk_idx * main_chunk_count * dtype.size(); + if (chunked_recv_counts[idx] > 0) { + is_empty_total_size = false; + } + } + + const size_t main_chunked_send_count = send_count / chunk_count; + const size_t chunked_send_count = chunked_recv_counts[comm->rank()]; + ccl_buffer chunked_send_buf{}; + if (!is_inplace) { + chunked_send_buf = + send_buf + chunk_idx * main_chunked_send_count * dtype.size(); + } + else { + chunked_send_buf = chunked_recv_bufs[comm->rank()]; + } + + if (is_empty_total_size) { + // TODO: ccl_coll_build_topo_allgatherv_fill should be able to handle 0-sized inputs! + // TODO: Similarly for other collectives + LOG_DEBUG("chunk_idx ", + chunk_idx, + " total size is empty. 
Not calling allgatherv for this chunk."); + return ccl::status::success; + } + return ccl_coll_build_topo_allgatherv_fill(s, + chunked_send_buf, + chunked_send_count, + chunked_recv_bufs, + chunked_recv_counts, + dtype, + comm, + is_inplace); + }, + ("ALLGATHERV_PIPE" + std::to_string(chunk_idx)).c_str()); + sched->add_barrier(); + } + + entry_factory::create(sched); return ccl::status::success; } diff --git a/src/coll/algorithms/allreduce/allreduce.cpp b/src/coll/algorithms/allreduce/allreduce.cpp index ec009d4bc..df001037e 100644 --- a/src/coll/algorithms/allreduce/allreduce.cpp +++ b/src/coll/algorithms/allreduce/allreduce.cpp @@ -39,6 +39,13 @@ ccl::status ccl_coll_build_direct_allreduce(ccl_sched* sched, ccl_comm* comm) { LOG_DEBUG("build direct allreduce"); + // count is the same for all ranks + // if one rank skips mpi collectives, all ranks skip + // this means we can safely skip all operations with zero count + if (count == 0) { + return ccl::status::success; + } + entry_factory::create(sched, send_buf, recv_buf, count, dtype, op, comm); return ccl::status::success; } @@ -60,6 +67,14 @@ ccl::status ccl_coll_build_rabenseifner_allreduce(ccl_sched* sched, size_t dtype_size = dtype.size(); comm_size = comm->size(); + + // count is the same for all ranks + // if one rank skips mpi collectives, all ranks skip + // this means we can safely skip all operations with zero count + if (count == 0) { + return ccl::status::success; + } + rank = comm->rank(); ccl_buffer tmp_buf = sched->alloc_buffer({ count * dtype_size, send_buf }); @@ -281,6 +296,14 @@ ccl::status ccl_coll_build_nreduce_allreduce(ccl_sched* sched, LOG_DEBUG("build nreduce allreduce"); ccl::status status = ccl::status::success; + + // count is the same for all ranks + // if one rank skips mpi collectives, all ranks skip + // this means we can safely skip all operations with zero count + if (count == 0) { + return status; + } + int comm_size = comm->size(); int comm_rank = comm->rank(); std::vector elem_counts(comm_size); @@ -422,6 +445,13 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched, int inplace = (send_buf == recv_buf) ? 1 : 0; LOG_DEBUG("build ring allreduce ", inplace ? 
"in-place" : "out-of-place"); + // count is the same for all ranks + // if one rank skips mpi collectives, all ranks skip + // this means we can safely skip all operations with zero count + if (count == 0) { + return ccl::status::success; + } + CCL_THROW_IF_NOT(sched && send_buf && recv_buf, "incorrect values, sched ", sched, @@ -431,7 +461,6 @@ ccl::status ccl_coll_build_ring_allreduce(ccl_sched* sched, recv_buf); ccl::status status = ccl::status::success; - ccl_coll_build_ring_reduce_scatter(sched, send_buf, recv_buf, count, dtype, op, comm); sched->add_barrier(); @@ -470,6 +499,13 @@ ccl::status ccl_coll_build_recursive_doubling_allreduce(ccl_sched* sched, ccl::status status = ccl::status::success; + // count is the same for all ranks + // if one rank skips mpi collectives, all ranks skip + // this means we can safely skip all operations with zero count + if (count == 0) { + return status; + } + int pof2, rem, comm_size, rank; int newrank, mask, newdst, dst; @@ -627,7 +663,7 @@ static void ccl_allreduce_2d_add_reduce_scatter_allreduce_allgather(ccl_sched* s ccl_buffer sbuf = send_buf + chunk_idx * main_chunk_size * dtype_size; ccl_buffer rbuf = recv_buf + chunk_idx * main_chunk_size * dtype_size; - ccl_coll_build_reduce_scatter(sched, sbuf, rbuf, cnt, dtype, op, first_dim_comm, true); + ccl_coll_build_reduce_scatter(sched, sbuf, rbuf, cnt, dtype, op, first_dim_comm, false, true); sched->add_barrier(); if (chunk_idx == (chunk_count - 1) || (chunk_count == 1)) { @@ -709,6 +745,13 @@ ccl::status ccl_coll_build_2d_allreduce(ccl_sched* sched, ccl_comm* comm) { ccl::status status = ccl::status::success; + // count is the same for all ranks + // if one rank skips mpi collectives, all ranks skip + // this means we can safely skip all operations with zero count + if (count == 0) { + return status; + } + size_t chunk_count = ccl::global_data::env().allreduce_2d_chunk_count; bool switch_dims = ccl::global_data::env().allreduce_2d_switch_dims; @@ -744,6 +787,7 @@ ccl::status ccl_coll_build_2d_allreduce(ccl_sched* sched, } #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -753,6 +797,13 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, ccl_comm* comm) { LOG_DEBUG("build topo allreduce"); + // count is the same for all ranks + // if one rank skips mpi collectives, all ranks skip + // this means we can safely skip all operations with zero count + if (count == 0) { + return ccl::status::success; + } + ccl_comm* pair_comm = comm->get_pair_comm().get(); ccl_comm* even_comm = comm->get_even_comm().get(); ccl_comm* node_comm = comm->get_node_comm().get(); @@ -771,8 +822,34 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, bool use_reduce_scatter_pipeline = ccl::global_data::env().reduce_scatter_monolithic_pipeline_kernel && even_comm->size() > 1 && pair_comm->size() > 1 && count >= (size_t)comm_size && - is_multi_card && - ccl::global_data::env().allgatherv_topo_read; // TODO: remove after refactoring allgatherv + is_multi_card && dtype != ccl::datatype::int8; + + // allgatherv pipeline uses xelink read and mdfi write + const bool use_allgatherv_pipeline = + ccl::global_data::env().allgatherv_monolithic_pipeline_kernel && count >= (size_t)comm_size; + + size_t base_count = count; + size_t pair_comm_offset = 0; + size_t pair_comm_offset_bytes = 0; + + bool barrier_1s_handle_exchange = false; + if (ccl::global_data::env().enable_ze_bidir_algo && (base_count 
/ pair_comm->size()) > 0) { + base_count = count / pair_comm->size(); + pair_comm_offset = base_count * pair_comm->rank(); + pair_comm_offset_bytes = pair_comm_offset * dtype.size(); + + if (pair_comm->rank() == pair_comm->size() - 1) + base_count += count % pair_comm->size(); + } + else if (pair_comm->rank() != ccl::global_data::env().kernel_1s_lead) { + barrier_1s_handle_exchange = true; + } + + size_t main_block_count = base_count / even_comm_size; + size_t block_count = main_block_count; + if (even_comm->rank() == even_comm_size - 1) { + block_count += base_count % even_comm_size; + } // tmp buff for write mode reduce scatter // TODO: fix - write based reduce_scatter fails intermittently for int8. However, such @@ -786,12 +863,18 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, // this entry combines reduce_scatter and allgather across even_comm // If write based copy is being used reduce_scatter, we should skip a2a_allreduce_entry // because that path uses read based copy instead - bool use_a2a_allreduce_entry = - is_single_node && !ccl::global_data::env().allgatherv_topo_read && !is_rs_write; + bool use_a2a_allreduce_entry = is_single_node && + !ccl::global_data::env().allgatherv_topo_read && !is_rs_write && + !use_reduce_scatter_pipeline && !use_allgatherv_pipeline; if (is_rs_write) { - int n_even_comm_sets = comm->size() / even_comm->size(); - size_t tmp_buf_bytes = dtype.size() * count / n_even_comm_sets + count % n_even_comm_sets; + // local rank does not store its data in the tmp buffer, so skip 1 block + // under uneven division, the last block can have extra data, reflected in block_count + size_t tmp_buf_bytes = dtype.size() * ((even_comm->size() - 1) * block_count); + // workaround with dummy 1 byte allocation to still enable handle exchange when tmp bytes is 0 + if (tmp_buf_bytes == 0) + tmp_buf_bytes = 1; + ccl::alloc_param alloc_param( tmp_buf_bytes, ccl::buffer_type::ze, ccl::buffer_place::device); tmp_write_buf = sched->alloc_buffer(alloc_param); @@ -848,10 +931,7 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, std::vector wait_events{}; ze_event_handle_t out_event{}; - // TODO: remove this if() condition. 
We want to always use a single list - if (!use_reduce_scatter_pipeline) { - sched->try_enable_ze_single_list(); - } + sched->try_enable_ze_single_list(); ccl::add_handle_exchange(sched, node_comm, @@ -866,19 +946,7 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, CCL_THROW_IF_NOT(comm_size % 2 == 0, "unexpected comm_size ", comm_size); CCL_THROW_IF_NOT(node_comm_size % 2 == 0, "unexpected node_comm_size ", node_comm_size); - size_t base_count = count; - size_t pair_comm_offset = 0; - size_t pair_comm_offset_bytes = 0; - - if (ccl::global_data::env().enable_ze_bidir_algo && (base_count / pair_comm->size()) > 0) { - base_count = count / pair_comm->size(); - pair_comm_offset = base_count * pair_comm->rank(); - pair_comm_offset_bytes = pair_comm_offset * dtype.size(); - - if (pair_comm->rank() == pair_comm->size() - 1) - base_count += count % pair_comm->size(); - } - else if (pair_comm->rank() != ccl::global_data::env().kernel_1s_lead) { + if (barrier_1s_handle_exchange) { ccl::add_comm_barrier( sched, pair_comm, wait_events, out_event, ipc_event_pool, ipc_event_count++); CCL_THROW_IF_NOT(ipc_event_count <= max_ipc_event_count, @@ -889,12 +957,6 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, return ccl::status::success; } - size_t main_block_count = base_count / even_comm_size; - size_t block_count = main_block_count; - if (even_comm->rank() == even_comm_size - 1) { - block_count += base_count % even_comm_size; - } - const size_t even_comm_offset = main_block_count * even_comm->rank(); const size_t even_comm_offset_bytes = even_comm_offset * dtype.size(); ccl_buffer even_comm_recv_buf{}; @@ -975,6 +1037,7 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, copy_attr(peer_rank, recv_buf_idx, copy_direction::t2t, + false, /*pt2pt_op*/ pair_comm, pair_comm_offset, pair_comm_offset), @@ -1102,13 +1165,14 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, } } - ccl_coll_entry_param coll_param{ .ctype = ccl_coll_allreduce, - .send_buf = even_comm_recv_buf, - .recv_buf = even_comm_recv_buf, - .count = block_count, - .dtype = dtype, - .reduction = op, - .comm = r2r_comm }; + ccl_coll_param coll_param{ false }; + coll_param.ctype = ccl_coll_allreduce; + coll_param.send_buf = even_comm_recv_buf; + coll_param.recv_buf = even_comm_recv_buf; + coll_param.count = block_count; + coll_param.dtype = dtype; + coll_param.reduction = op; + coll_param.comm = r2r_comm; out_event = nullptr; ccl::add_scaleout(sched, coll_param, is_single_node, wait_events, out_event); @@ -1116,16 +1180,11 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, clear_and_push_back(wait_events, out_event); } - // allgatherv pipeline uses xelink read which is is_read_allgatherv=1 - const bool use_allgatherv_pipeline = - ccl::global_data::env().allgatherv_monolithic_pipeline_kernel && - count >= (size_t)comm_size && is_read_allgatherv; - if (is_multi_card && !use_a2a_allreduce_entry) { LOG_DEBUG("topo/scale_up/inter: use ze_a2a_allgatherv"); - // for multinode with allgatherv_read, use a comm_barrier to make sure all + // for multinode with xelink read, use a comm_barrier to make sure all // r2r scaleout within even_comm has finished so that remote reads are valid - if (!is_single_node && is_read_allgatherv) { + if (!is_single_node && (is_read_allgatherv || use_allgatherv_pipeline)) { ccl::add_comm_barrier( sched, even_comm, wait_events, out_event, ipc_event_pool, ipc_event_count++); clear_and_push_back(wait_events, out_event); @@ -1164,6 +1223,7 @@ 
ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, attr.peer_rank = peer_pair_rank; attr.peer_buf_idx = recv_buf_idx; attr.direction = copy_direction::t2t; + attr.pt2pt_op = false; attr.map_comm = pair_comm; attr.in_buf_offset = pair_comm_offset + even_comm_offset; attr.out_buf_offset = pair_comm_offset + even_comm_offset; @@ -1192,6 +1252,7 @@ ccl::status ccl_coll_build_topo_allreduce_fill(ccl_sched* sched, copy_attr(peer_rank, recv_buf_idx, copy_direction::t2t, + false, /*pt2pt_op*/ pair_comm, pair_comm_offset, pair_comm_offset), @@ -1219,9 +1280,152 @@ ccl::status ccl_coll_build_topo_allreduce(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction op, ccl_comm* comm) { - ccl_coll_build_topo_allreduce_fill(sched, send_buf, recv_buf, count, dtype, op, comm); + // Note about cache lines and pipelining: The same cache line must contain + // a single chunk only. + // + // If the same cache line contains two chunks (or more), and we parallelize + // the instructions required for both chunks, a conflict (race condition) + // may appear between the copy-out for the scaleout portion and the + // reduce_scatter phase. + // + // The easiest way to avoid that race condition is to require that each + // cache line contains a single entry. If that is not the case, we must not + // parallelize the instructions for different chunks. + + size_t chunk_count = ccl::global_data::env().allreduce_pipe_chunk_count; + bool is_pipe = chunk_count > 0 && ccl::global_data::env().enable_ze_single_list; + + // TODO: why does oneCCL have CACHELINE_SIZE *and* CCL_KERNEL_MEM_ALIGN? + size_t memalign = ccl::global_data::env().kernel_mem_align; + size_t buf_size_bytes = count * dtype.size(); + + // First, determine if we need to fall back to the non-pipelined algorithm. + // Such a fallback may happen in cases such as (1) the user requests it, + // (2) the message fits into a cache line, or (3) the cache line size is not + // divisible by the data type size. +
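For reviewers: the chunk-sizing rule enforced here (every chunk spans whole cache lines, so two chunks never share one) can be exercised in isolation. The following is a minimal standalone sketch of that arithmetic; the concrete values (64-byte cache line, 4-byte elements, 8 requested chunks) are illustrative assumptions rather than oneCCL defaults, and the variable names merely mirror the diff:

```cpp
// Standalone sketch of the chunk sizing performed below (not oneCCL API).
#include <algorithm>
#include <cstddef>
#include <iostream>

int main() {
    const std::size_t memalign = 64;         // assumed cache-line/alignment size
    const std::size_t dtype_size = 4;        // e.g. fp32
    const std::size_t count = 10000;         // elements in the message
    const std::size_t requested_chunks = 8;  // analog of allreduce_pipe_chunk_count

    const std::size_t buf_size_bytes = count * dtype_size;
    // Per-chunk size rounded down to whole cache lines, never below one line.
    const std::size_t lines_per_chunk =
        std::max(memalign, buf_size_bytes / requested_chunks) / memalign;
    const std::size_t main_chunk_size_bytes = memalign * lines_per_chunk;

    // Fallback conditions mirrored from the code below.
    const bool dtype_non_divisible = main_chunk_size_bytes % dtype_size != 0;
    const bool msg_smaller_than_line = buf_size_bytes <= main_chunk_size_bytes;

    // Re-derive the effective chunk count; the last chunk absorbs the remainder.
    const std::size_t main_chunk_count = main_chunk_size_bytes / dtype_size;
    const std::size_t chunks = count / main_chunk_count;
    const std::size_t last_chunk_count = main_chunk_count + count % main_chunk_count;

    std::cout << std::boolalpha << chunks << " chunks of " << main_chunk_count
              << " elements (last: " << last_chunk_count << "), fallback="
              << (dtype_non_divisible || msg_smaller_than_line) << '\n';
}
```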
+ size_t number_of_cache_lines_per_chunk = + !is_pipe ? 1 : std::max(memalign, buf_size_bytes / chunk_count) / memalign; + size_t main_chunk_size_bytes = memalign * number_of_cache_lines_per_chunk; + + bool is_dtype_non_divisible = main_chunk_size_bytes % dtype.size(); + bool is_msg_smaller_than_cache_line = buf_size_bytes <= main_chunk_size_bytes; + + bool is_multiworker = + ccl::global_data::env().ze_multi_workers && ccl::global_data::env().worker_count > 1; + + if (!is_pipe || is_dtype_non_divisible || is_msg_smaller_than_cache_line || is_multiworker) { + // Fall back to topo algorithm without pipelining + + if (!is_pipe) { + LOG_DEBUG("Pipelining code disabled"); + } + else if (is_dtype_non_divisible) { + LOG_INFO("Running without pipelining because the cache-line-aligned chunk size (", + main_chunk_size_bytes, + ") is not divisible by the datatype size (", + dtype.size(), + ")"); + } + else if (is_msg_smaller_than_cache_line) { + LOG_INFO("Running without pipelining because message size (", + buf_size_bytes, + ") is smaller than a cache line (", + memalign, + ") and main_chunk_size_bytes (", + main_chunk_size_bytes, + ")"); + } + else if (is_multiworker) { + LOG_INFO( + "Running without pipelining because ze_multi_workers was requested with more than one worker"); + } + else { + CCL_THROW("Unexpected fallback to non-pipe code"); + } + + ccl_coll_build_topo_allreduce_fill(sched, send_buf, recv_buf, count, dtype, op, comm); + + entry_factory::create(sched); + + return ccl::status::success; + } + + LOG_DEBUG("build pipe allreduce"); + + size_t main_chunk_count = main_chunk_size_bytes / dtype.size(); + + // Need to re-calculate chunk_count after main_chunk_size_bytes calculation + // with cache alignment in mind. + chunk_count = count / main_chunk_count; + size_t last_chunk_count = main_chunk_count + (count % main_chunk_count); + + sched->try_enable_ze_single_list(); + auto sync_obj = std::make_shared(chunk_count); + bool is_parallelizable_chunks = true; + + for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { + size_t chunk_offset = chunk_idx * main_chunk_count * dtype.size(); + ccl_buffer sbuf = send_buf + chunk_offset; + ccl_buffer rbuf = recv_buf + chunk_offset; + size_t this_chunk_count = + (chunk_idx == (chunk_count - 1)) ? last_chunk_count : main_chunk_count; + + if (this_chunk_count || (count == 0 && chunk_idx == 0)) { + entry_factory::create( + sched, + chunk_idx, + [sched, sbuf, rbuf, this_chunk_count, dtype, op, comm, sync_obj](ccl_sched* s) { + s->inherit_ze_managers_from(sched); + s->set_init_ze_hook_sync_obj(sync_obj); + s->set_ze_commands_bypass_flag(false); + + ccl_coll_build_topo_allreduce_fill( + s, sbuf, rbuf, this_chunk_count, dtype, op, comm); + }, + ("ALLREDUCE_PIPE" + std::to_string(chunk_idx)).c_str()); + } + if (chunk_idx > 0) { + auto ptr = reinterpret_cast(rbuf.get_ptr()); + auto prev_chunk_last_cache_line = (ptr - 1) / memalign; + auto this_chunk_first_cache_line = ptr / memalign; + + if (prev_chunk_last_cache_line == this_chunk_first_cache_line) { + // WARNING: previous chunk has part of this chunk's first cache + // line. Cannot use pipelining. However, since this is a + // "local" decision (i.e., other ranks may decide differently), + // we still need to apply chunking, but we will run one + // chunk at a time, without parallelizing them. + // Another way to have implemented this would be to link the + // last task of the prev chunk with the first of this chunk + // with an event. 
+ is_parallelizable_chunks = false; + } + } + } + + static bool is_chunk_memalign_warning_printed{}; + if (!is_parallelizable_chunks && !is_chunk_memalign_warning_printed) { + is_chunk_memalign_warning_printed = true; + LOG_WARN( + "[allreduce pipelining]: For best performance, (i) chunk size should be a multiple of a cache line (", + memalign, + " bytes), and (ii) buffers in all ranks should be aligned to ", + memalign); + } + + if (!is_parallelizable_chunks) { + ccl::global_data::get() + .metrics_profiler->allreduce_pipe_nonparallel_calls_per_count[count]++; + } + else { + ccl::global_data::get().metrics_profiler->allreduce_pipe_parallel_calls_per_count[count]++; + } - entry_factory::create(sched); + entry_factory::create( + sched, + sync_obj, + is_parallelizable_chunks ? submit_ze_commands_in_subsched_entries : nullptr); return ccl::status::success; } diff --git a/src/coll/algorithms/alltoallv.cpp b/src/coll/algorithms/alltoallv.cpp index 02a9012ec..efa3c73ed 100644 --- a/src/coll/algorithms/alltoallv.cpp +++ b/src/coll/algorithms/alltoallv.cpp @@ -86,8 +86,10 @@ ccl::status ccl_coll_calculate_alltoallv_counts(const ccl_coll_param& coll_param recv_offsets[idx] = recv_offsets[idx - 1] + recv_counts[idx - 1] * dtype_size; } - total_send_count = std::accumulate(send_counts.begin(), send_counts.end(), 0); - total_recv_count = std::accumulate(recv_counts.begin(), recv_counts.end(), 0); + total_send_count = + std::accumulate(send_counts.begin(), send_counts.end(), ccl::utils::initial_count_value); + total_recv_count = + std::accumulate(recv_counts.begin(), recv_counts.end(), ccl::utils::initial_count_value); total_send_bytes = total_send_count * dtype_size; total_recv_bytes = total_recv_count * dtype_size; @@ -467,8 +469,7 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, return; } copy_attr attr{}; - attr.hint_queue_index = parallel_copy_events.size(); - attr.direction = copy_direction::c2c; + attr.direction = copy_direction::d2d; auto entry = entry_factory::create( sched, send, recv, count, dtype, attr, wait_events); parallel_copy_events.push_back(entry->entry_event); @@ -515,14 +516,14 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, } // preparation for host alltoall coll - ccl_coll_entry_param host_coll_param{ .ctype = ccl_coll_alltoallv, - .send_bufs = in_bufs, - .recv_bufs = out_bufs, - .send_counts = tmp_send_counts.data(), - .recv_counts = tmp_recv_counts.data(), - .dtype = dtype, - .comm = comm }; - + ccl_coll_param host_coll_param{ false }; + host_coll_param.ctype = ccl_coll_alltoallv; + host_coll_param.send_scale_out_bufs = in_bufs; + host_coll_param.recv_scale_out_bufs = out_bufs; + host_coll_param.send_counts = tmp_send_counts; + host_coll_param.recv_counts = tmp_recv_counts; + host_coll_param.dtype = dtype; + host_coll_param.comm = comm; host_coll_param.hint_algo.alltoallv = ccl_coll_alltoallv_direct; // do alltoall on the host (scale out) using global comm @@ -533,7 +534,6 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, // returned back saved value ccl::global_data::env().ze_multi_workers = ze_multi_workers_saved; }; - // TODO: enable alltoallv vectorized support for monolithic kernel MLSL-1371 bool can_use_monolithic = true; for (int idx = 0; idx < comm_size; idx++) { @@ -613,7 +613,12 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, attr.peer_rank = peer_rank; attr.peer_buf_idx = start_buf_idx + offset; attr.map_comm = comm; - attr.hint_queue_index = parallel_copy_events.size(); + auto 
copy_engine_idx = card_idx * 2; + if (ccl::global_data::env().type2_mode == ccl::type2_tune_mode::detected || + ccl::global_data::env().type2_mode == ccl::type2_tune_mode::on) { + copy_engine_idx = parallel_copy_events.size() * 2; + } + attr.hint_queue_index = copy_engine_idx; attr.direction = copy_direction::c2c; if (!is_single_node) { @@ -723,7 +728,7 @@ ccl::status ccl_coll_build_topo_alltoallv(ccl_sched* main_sched, } ccl::add_comm_barrier(sched, node_comm, wait_events, out_event); - entry_factory::create(sched); + entry_factory::create(sched); return ccl::status::success; } diff --git a/src/coll/algorithms/bcast.cpp b/src/coll/algorithms/bcast.cpp index c37d8e7a4..670d7b9a2 100644 --- a/src/coll/algorithms/bcast.cpp +++ b/src/coll/algorithms/bcast.cpp @@ -245,6 +245,10 @@ ccl::status ccl_coll_build_topo_bcast(ccl_sched* sched, ccl_comm* comm) { LOG_DEBUG("build topo bcast"); + if (count == 0) { + return ccl::status::success; + } + ccl_comm* node_comm = comm->get_node_comm().get(); const std::vector buffers{ @@ -255,6 +259,8 @@ ccl::status ccl_coll_build_topo_bcast(ccl_sched* sched, std::vector wait_events; ze_event_handle_t out_event; + sched->try_enable_ze_single_list(); + ccl::add_handle_exchange(sched, node_comm, wait_events, out_event, buffers); clear_and_push_back(wait_events, out_event); @@ -271,7 +277,7 @@ ccl::status ccl_coll_build_topo_bcast(ccl_sched* sched, ccl::add_comm_barrier(sched, node_comm, wait_events, out_event); - entry_factory::create(sched); + entry_factory::create(sched); return ccl::status::success; } diff --git a/src/coll/algorithms/recv.cpp b/src/coll/algorithms/recv.cpp new file mode 100644 index 000000000..45550c140 --- /dev/null +++ b/src/coll/algorithms/recv.cpp @@ -0,0 +1,113 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +/* +* +* (C) 2001 by Argonne National Laboratory. +* See COPYRIGHT in top-level directory. 
+*/ + +#include "coll/algorithms/algorithms.hpp" +#include "coll/coll_util.hpp" +#include "common/utils/utils.hpp" +#include "sched/entry/factory/entry_factory.hpp" + +ccl::status ccl_coll_build_direct_recv(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int peer_rank, + ccl_comm* comm) { + ccl::status status = ccl::status::success; + + LOG_DEBUG("build direct RECV: ", comm->rank(), " count: ", count, ", peer_rank: ", peer_rank); + CCL_THROW_IF_NOT(peer_rank > CCL_INVALID_PEER_RANK_IDX && peer_rank < comm->size(), + "invalid peer_rank: ", + peer_rank); + entry_factory::create(sched, buf, count, dtype, peer_rank, comm); + + return status; +} + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +ccl::status ccl_coll_build_topo_recv(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int peer_rank, + ccl_comm* comm) { + ccl::status status = ccl::status::success; + + ccl_comm* node_comm = comm->get_node_comm().get(); + auto node_peer_rank = node_comm->get_rank_from_global(peer_rank); + auto node_curr_rank = node_comm->rank(); + + LOG_DEBUG("build topo RECV buf: ", buf.get_ptr(), " and peer_rank: ", node_peer_rank); + CCL_THROW_IF_NOT( + node_peer_rank > CCL_INVALID_PEER_RANK_IDX && node_peer_rank < node_comm->size(), + "invalid peer_rank: ", + node_peer_rank, + " for recv op"); + const std::vector buffer{ + { buf.get_ptr(), ccl::ze::ipc_mem_type::memory }, // 0 idx + }; + + std::vector wait_events{}; + ze_event_handle_t out_event{}; + + ccl::utils::pt2pt_handle_exchange_info info{ node_peer_rank, + ccl::utils::pt2pt_handle_exchange_role::receiver }; + if (!ccl::global_data::env().ze_pt2pt_read) { + info.role = ccl::utils::pt2pt_handle_exchange_role::sender; + } + + ccl::add_handle_exchange( + sched, node_comm, wait_events, out_event, buffer, ccl_comm::invalid_rank, nullptr, 0, info); + LOG_DEBUG("build RECV: add_handle_exchange is done"); + + ccl_sched_id_t pt2pt_ack_tag = node_comm->get_atl_comm()->tag_creator->get_pt2pt_ack_tag(); + + if (ccl::global_data::env().ze_pt2pt_read) { + LOG_DEBUG("build RECV: read mode is enabled"); + entry_factory::create( + sched, + ccl_buffer(), + buf, + count, + dtype, + copy_attr(node_peer_rank, 0, copy_direction::d2d, true /*pt2pt_op*/)); + LOG_DEBUG("build RECV: copy_entry is done"); + + uint64_t ack_tag = node_comm->get_atl_comm()->tag_creator->create( + node_curr_rank, node_comm->get_comm_id(), pt2pt_ack_tag, sched->get_op_id()); + ccl::utils::send_ack_to_peer(node_comm->get_atl_comm(), ack_tag, node_peer_rank); + } + else { + uint64_t ack_tag = node_comm->get_atl_comm()->tag_creator->create( + node_peer_rank, node_comm->get_comm_id(), pt2pt_ack_tag, sched->get_op_id()); + ccl::utils::recv_ack_from_peer(node_comm->get_atl_comm(), ack_tag, node_peer_rank); + LOG_DEBUG("build RECV: recv_ack_from_peer is done with tag: ", + ack_tag, + ", comm_rank: ", + comm->rank(), + ", peer_rank: ", + node_peer_rank); + } + + entry_factory::create(sched); + return status; +} +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
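The ack-tag exchange in the new send/recv paths orders buffer reuse between the two peers: in read mode the receiver pulls the data and then acks, while in write mode it blocks until the sender's ack arrives. The sketch below is a compressed, in-process model of just that ordering; the threads and condition variable are stand-ins of mine for the two ranks and the ATL tag matching, not how oneCCL actually implements the handshake:

```cpp
// Toy model of the pt2pt ack handshake from send.cpp/recv.cpp.
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

struct AckChannel { // stands in for the ATL pt2pt ack tag
    std::mutex m;
    std::condition_variable cv;
    bool acked = false;
    void send_ack() {
        { std::lock_guard<std::mutex> l(m); acked = true; }
        cv.notify_one();
    }
    void wait_ack() {
        std::unique_lock<std::mutex> l(m);
        cv.wait(l, [&] { return acked; });
    }
};

int main() {
    AckChannel ch;
    const bool ze_pt2pt_read = true; // read mode: receiver pulls, then acks

    std::thread receiver([&] {
        if (ze_pt2pt_read) {
            std::cout << "recv: read peer buffer\n"; // copy_entry reads remotely
            ch.send_ack(); // tell the sender its buffer may be reused
        } else {
            ch.wait_ack(); // write mode: wait until the sender has written
        }
    });
    std::thread sender([&] {
        if (ze_pt2pt_read) {
            ch.wait_ack(); // block until the receiver finished reading
        } else {
            std::cout << "send: write into peer buffer\n";
            ch.send_ack();
        }
    });
    sender.join();
    receiver.join();
}
```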
diff --git a/src/coll/algorithms/reduce.cpp b/src/coll/algorithms/reduce.cpp index 640ce9d15..41c5ec568 100644 --- a/src/coll/algorithms/reduce.cpp +++ b/src/coll/algorithms/reduce.cpp @@ -65,6 +65,9 @@ ccl::status ccl_coll_build_direct_reduce(ccl_sched* sched, ccl_comm* comm) { LOG_DEBUG("build direct reduce"); + if (count == 0) + return ccl::status::success; + entry_factory::create( sched, send_buf, recv_buf, count, dtype, reduction, root, comm); return ccl::status::success; @@ -82,6 +85,9 @@ ccl::status ccl_coll_build_rabenseifner_reduce(ccl_sched* sched, ccl::status status = ccl::status::success; + if (count == 0) + return ccl::status::success; + int i, j, comm_size, rank, local_root, pof2; int rem, dst, new_rank, new_dst, mask, send_idx, recv_idx, last_idx; int send_cnt, recv_cnt, newroot, newdst_tree_root, newroot_tree_root; @@ -366,6 +372,9 @@ ccl::status ccl_coll_build_ring_reduce(ccl_sched* sched, int comm_size = comm->size(); ccl::status status = ccl::status::success; + if (count == 0) + return status; + if (rank != local_root) { recv_buf = sched->alloc_buffer({ count * dtype_size, send_buf }); } @@ -513,6 +522,32 @@ ccl::status ccl_coll_build_binomial_reduce(ccl_sched* sched, #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +void get_counts_n_offsets_bidir(size_t count, + size_t pair_comm_size, + size_t pair_comm_rank, + size_t even_comm_size, + size_t even_comm_rank, + const ccl_datatype& dtype, + size_t& base_count, + size_t& pair_comm_offset, + size_t& pair_comm_offset_bytes, + size_t& main_block_count, + size_t& block_count, + size_t& even_comm_offset_bytes) { + base_count = count / pair_comm_size; + pair_comm_offset = base_count * pair_comm_rank; + pair_comm_offset_bytes = pair_comm_offset * dtype.size(); + + if (pair_comm_rank == pair_comm_size - 1) + base_count += count % pair_comm_size; + main_block_count = base_count / even_comm_size; + block_count = main_block_count; + if (even_comm_rank == even_comm_size - 1) { + block_count += base_count % even_comm_size; + } + even_comm_offset_bytes = main_block_count * even_comm_rank * dtype.size(); +} + ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, ccl_buffer send_buf, ccl_buffer recv_buf, @@ -523,13 +558,15 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, ccl_comm* comm) { LOG_DEBUG("build gpu topo reduce"); + if (count == 0) + return ccl::status::success; + ccl_comm* pair_comm = comm->get_pair_comm().get(); ccl_comm* even_comm = comm->get_even_comm().get(); ccl_comm* node_comm = comm->get_node_comm().get(); ccl_comm* r2r_comm = comm->get_r2r_comm().get(); int comm_size = comm->size(); - int pair_comm_size = pair_comm->size(); int even_comm_size = even_comm->size(); int node_comm_size = node_comm->size(); @@ -543,12 +580,36 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, // allocate tmp buff for write ccl_buffer tmp_write_buf; - bool is_rs_write = - !ccl::global_data::env().reduce_scatter_topo_read && !use_read_write_pipeline && - !ccl::global_data::env().reduce_scatter_monolithic_kernel && dtype != ccl::datatype::int8; + bool is_rs_write = !ccl::global_data::env().reduce_scatter_topo_read && + !use_read_write_pipeline && + !ccl::global_data::env().reduce_scatter_monolithic_kernel && + dtype != ccl::datatype::int8 && even_comm_size > 1; + + size_t base_count = count; + size_t pair_comm_offset = 0; + size_t pair_comm_offset_bytes = 0; + size_t main_block_count; + size_t block_count; + size_t even_comm_offset_bytes; + get_counts_n_offsets_bidir(count, + pair_comm->size(), + pair_comm->rank(), + even_comm->size(), + even_comm->rank(), + dtype, + base_count, + pair_comm_offset, + pair_comm_offset_bytes, + main_block_count, + block_count, + even_comm_offset_bytes); + if (is_rs_write) { - size_t buf_bytes = dtype.size() * count; - size_t tmp_buf_bytes = buf_bytes / pair_comm_size; + size_t tmp_buf_bytes = 0; + tmp_buf_bytes = dtype.size() * ((even_comm_size - 1) * (block_count)); + // workaround with dummy 1 byte allocation to still enable handle exchange when tmp bytes is 0 + 
if (tmp_buf_bytes == 0) + tmp_buf_bytes = 1; ccl::alloc_param alloc_param( tmp_buf_bytes, ccl::buffer_type::ze, ccl::buffer_place::device); tmp_write_buf = sched->alloc_buffer(alloc_param); @@ -615,10 +676,9 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, wait_events, out_event, in_buffers, - ccl_comm::invalid_rank, + ccl_comm::invalid_rank /*skip_rank*/, ipc_event_pool, ipc_event_count++); - clear_and_push_back(wait_events, out_event); CCL_THROW_IF_NOT(comm_size % 2 == 0, "unexpected comm_size ", comm_size); CCL_THROW_IF_NOT(node_comm_size % 2 == 0, "unexpected node_comm_size ", node_comm_size); @@ -635,23 +695,6 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, clear_and_push_back(wait_events, out_event); } else if (ccl::global_data::env().enable_ze_bidir_algo) { - size_t base_count = count; - size_t pair_comm_offset = 0; - size_t pair_comm_offset_bytes = 0; - - base_count = count / pair_comm->size(); - pair_comm_offset = base_count * pair_comm->rank(); - pair_comm_offset_bytes = pair_comm_offset * dtype.size(); - - if (pair_comm->rank() == pair_comm->size() - 1) - base_count += count % pair_comm->size(); - - size_t main_block_count = base_count / even_comm_size; - size_t block_count = main_block_count; - if (even_comm->rank() == even_comm_size - 1) { - block_count += base_count % even_comm_size; - } - size_t even_comm_offset_bytes = main_block_count * even_comm->rank() * dtype.size(); ccl_buffer pair_comm_send_buf = send_buf + pair_comm_offset_bytes; ccl_buffer pair_comm_recv_buf = tmp_buf; ccl_buffer even_comm_recv_buf = tmp_buf + even_comm_offset_bytes; @@ -774,14 +817,15 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, if (!is_single_node) { bool one_tmp_buf = use_tmp_buf && !use_read_write_pipeline; - ccl_coll_entry_param coll_param{ .ctype = ccl_coll_reduce, - .send_buf = even_comm_recv_buf, - .recv_buf = even_comm_recv_buf, - .count = block_count, - .dtype = dtype, - .reduction = op, - .root = root_node_idx, - .comm = r2r_comm }; + ccl_coll_param coll_param{ false }; + coll_param.ctype = ccl_coll_reduce; + coll_param.send_buf = even_comm_recv_buf; + coll_param.recv_buf = even_comm_recv_buf; + coll_param.count = block_count; + coll_param.dtype = dtype; + coll_param.reduction = op; + coll_param.root = root_node_idx; + coll_param.comm = r2r_comm; out_event = nullptr; ccl::add_scaleout(sched, @@ -810,8 +854,13 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, node_comm->rank() == root_in_node_comm ? 
recv_buf : ccl_buffer(), block_count, dtype, - copy_attr( - root_in_node_comm, recv_buf_idx, copy_direction::d2d, node_comm, 0, offset), + copy_attr(root_in_node_comm, + recv_buf_idx, + copy_direction::d2d, + false /*pt2pt_op*/, + node_comm, + 0, + offset), wait_events); clear_and_push_back(wait_events, entry_copy->entry_event); ccl::add_comm_barrier( @@ -841,21 +890,21 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, wait_events); clear_and_push_back(wait_events, onesided_reduce_entry->entry_event); - size_t main_block_count = count / even_comm_size; - size_t block_count = main_block_count; + size_t main_block_count_1s = count / even_comm_size; + size_t block_count_1s = main_block_count_1s; if (even_comm->rank() == even_comm_size - 1) { - block_count += count % even_comm_size; + block_count_1s += count % even_comm_size; } ccl::add_comm_barrier(sched, even_comm, wait_events, out_event); clear_and_push_back(wait_events, out_event); - size_t offset_bytes = main_block_count * even_comm->rank() * dtype.size(); - ccl_buffer partial_tmp_buf = tmp_buf + offset_bytes; + size_t offset_bytes_1s = main_block_count_1s * even_comm->rank() * dtype.size(); + ccl_buffer partial_tmp_buf = tmp_buf + offset_bytes_1s; LOG_DEBUG("topo/scale_up/inter: use ze_a2a_reduce_scatter_entry"); - std::vector block_counts(even_comm->size(), main_block_count); - block_counts[even_comm->size() - 1] = block_count; + std::vector block_counts(even_comm->size(), main_block_count_1s); + block_counts[even_comm->size() - 1] = block_count_1s; auto reduce_scatter_entry = entry_factory::create(sched, tmp_buf, @@ -873,23 +922,24 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, CCL_THROW_IF_NOT(comm->size() % node_comm_size == 0); int root_node_idx = root / node_comm_size; - ccl_coll_entry_param coll_param{ .ctype = ccl_coll_reduce, - .send_buf = partial_tmp_buf, - .recv_buf = partial_tmp_buf, - .count = block_count, - .dtype = dtype, - .reduction = op, - .root = root_node_idx, - .comm = r2r_comm }; + ccl_coll_param coll_param{ false }; + coll_param.ctype = ccl_coll_reduce; + coll_param.send_buf = partial_tmp_buf; + coll_param.recv_buf = partial_tmp_buf; + coll_param.count = block_count_1s; + coll_param.dtype = dtype; + coll_param.reduction = op; + coll_param.root = root_node_idx; + coll_param.comm = r2r_comm; copy_attr h2d_copy_attr{}; if (root_node_idx == r2r_comm->rank()) { LOG_DEBUG("topo/scale_up/intra: use ze_onesided_bcast"); int root_in_node_comm = node_comm->get_rank_from_global(root); - size_t offset_count = offset_bytes / dtype.size(); + size_t offset_count = offset_bytes_1s / dtype.size(); copy_attr local_attr(root_in_node_comm, recv_buf_idx, copy_direction::h2d, + false, /*pt2pt_op*/ node_comm, 0, offset_count); @@ -916,7 +966,7 @@ ccl::status ccl_coll_build_topo_reduce(ccl_sched* sched, CCL_ASSERT(wait_events.size() == 1 && wait_events.back() != nullptr, "wait_events should have a single, valid event"); - entry_factory::create(sched); + entry_factory::create(sched); return ccl::status::success; } diff --git a/src/coll/algorithms/reduce_scatter.cpp b/src/coll/algorithms/reduce_scatter.cpp index 096ec6e7a..498be0a38 100644 --- a/src/coll/algorithms/reduce_scatter.cpp +++ b/src/coll/algorithms/reduce_scatter.cpp @@ -35,6 +35,9 @@ ccl::status ccl_coll_build_direct_reduce_scatter(ccl_sched* sched, ccl_comm* comm) { LOG_DEBUG("build direct reduce_scatter"); + if (recv_count == 0) + return ccl::status::success; + entry_factory::create( sched, send_buf, recv_buf, recv_count, dtype, reduction, comm); return
ccl::status::success; @@ -47,6 +50,10 @@ ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction op, ccl_comm* comm) { + if (recv_count == 0) { + return ccl::status::success; + } + CCL_THROW_IF_NOT(sched && send_buf && recv_buf, "incorrect values, sched ", sched, @@ -67,10 +74,6 @@ ccl::status ccl_coll_build_ring_reduce_scatter_block(ccl_sched* sched, comm_size = comm->size(); rank = comm->rank(); - if (recv_count == 0) { - return ccl::status::success; - } - if (!inplace) { /* copy local data into recv_buf */ entry_factory::create( @@ -157,6 +160,10 @@ ccl::status ccl_coll_build_ring_reduce_scatter(ccl_sched* sched, size_t count = recv_count; size_t bytes = count * dtype_size; + if (comm_size == 0) { + return status; + } + size_t chunk_count = (bytes >= ccl::global_data::env().rs_min_chunk_size && count >= ccl::global_data::env().rs_chunk_count && (int)count >= comm_size) @@ -355,6 +362,9 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, ccl::reduction op, ccl_comm* comm) { LOG_DEBUG("build topo reduce_scatter, recv_count ", recv_count); + if (recv_count == 0) { + return ccl::status::success; + } ccl_comm* pair_comm = comm->get_pair_comm().get(); ccl_comm* even_comm = comm->get_even_comm().get(); @@ -365,6 +375,7 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, const int pair_comm_size = pair_comm->size(); const int even_comm_size = even_comm->size(); const int node_comm_size = node_comm->size(); + const int r2r_comm_size = r2r_comm->size(); const ccl::topo_manager& topo_manager = comm->get_topo_manager(); const bool is_single_node = topo_manager.is_single_node; @@ -381,15 +392,15 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, }; // TODO: fix - reduce_scatter pipeline uses xelink write which seems to fail with int8 + // TODO: enable pipeline for scaleout const bool use_reduce_scatter_pipeline = ccl::global_data::env().reduce_scatter_monolithic_pipeline_kernel && pair_comm_size > 1 && - dtype != ccl::datatype::int8; + dtype != ccl::datatype::int8 && is_single_node; LOG_DEBUG("topo/reduce_scatter pipeline ", use_reduce_scatter_pipeline); - // optimized non-fallback algorithm is currently supported for scaleup and is bidirectional + // optimized non-fallback algorithm is currently supported for bidirectional case const bool use_non_fallback_algo = !ccl::global_data::env().reduce_scatter_fallback_algo && - ccl::global_data::env().enable_ze_bidir_algo && - is_single_node; + ccl::global_data::env().enable_ze_bidir_algo; LOG_DEBUG("topo/reduce_scatter fallback algo ", !use_non_fallback_algo); const bool is_inplace = send_buf == recv_buf; @@ -400,38 +411,95 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, const bool use_tmp_buf = !is_single_node || !use_non_fallback_algo || !is_single_card || pair_comm_size > 1 || is_inplace; + bool is_rs_write = !ccl::global_data::env().reduce_scatter_topo_read && + !use_reduce_scatter_pipeline && use_non_fallback_algo && + dtype != ccl::datatype::int8; + LOG_DEBUG("topo/reduce_scatter is_rs_write:", + is_rs_write, + ccl::global_data::env().reduce_scatter_topo_read, + use_reduce_scatter_pipeline, + use_non_fallback_algo, + dtype != ccl::datatype::int8); + size_t base_count = count; + size_t pair_comm_offset = 0; + size_t pair_comm_offset_bytes = 0; + + if (ccl::global_data::env().enable_ze_bidir_algo) { + base_count = count / pair_comm->size(); + pair_comm_offset = base_count * pair_comm->rank(); + pair_comm_offset_bytes = 
pair_comm_offset * dtype.size(); + + if (pair_comm->rank() == pair_comm->size() - 1) + base_count += count % pair_comm->size(); + } + + size_t main_block_count = base_count / even_comm_size; + size_t block_count = main_block_count; + if (even_comm->rank() == even_comm_size - 1) { + block_count += base_count % even_comm_size; + } + + // setup tmp buffer for write copy mode + ccl_buffer tmp_write_buf; + + if (is_rs_write) { + size_t tmp_buf_bytes = 0; + tmp_buf_bytes = dtype.size() * ((even_comm_size - 1) * block_count); + // workaround with dummy 1 byte to avoid allocation with 0 byte + if (tmp_buf_bytes == 0) { + tmp_buf_bytes = 1; + } + ccl::alloc_param alloc_param( + tmp_buf_bytes, ccl::buffer_type::ze, ccl::buffer_place::device); + tmp_write_buf = sched->alloc_buffer(alloc_param); + LOG_DEBUG("topo/reduce_scatter: allocate temp write buffer"); + } + + size_t tmp_write_buf_idx = -1; + if (is_rs_write) { + in_buffers.push_back({ tmp_write_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }); + tmp_write_buf_idx = in_buffers.size() - 1; + } std::vector tmp_bufs; size_t tmp_buf_idx_start = -1; + ccl_buffer tmp_buf; // reduce_scatter pipeline entry require distinct temp buffers for each peer if (use_non_fallback_algo && use_reduce_scatter_pipeline) { ze_utils::alloc_tmp_bufs( sched, comm, tmp_bufs, in_buffers, tmp_buf_idx_start, count, dtype); } else if (use_tmp_buf) { - const size_t recv_bytes = recv_count * dtype.size(); - size_t tmp_buf_bytes = comm_size * recv_bytes; - // non fallback algo only needs temp data for a plane - if (use_non_fallback_algo) { + size_t tmp_buf_bytes = count * dtype.size(); + // single-node non-fallback algo only needs temp buffer for a plane + if (use_non_fallback_algo && is_single_node) { tmp_buf_bytes /= pair_comm_size; } ccl::alloc_param alloc_param( tmp_buf_bytes, ccl::buffer_type::ze, ccl::buffer_place::device); - tmp_bufs.push_back(sched->alloc_buffer(alloc_param)); + tmp_buf = sched->alloc_buffer(alloc_param); + + const size_t tmp_buf_size_per_rank = recv_count * r2r_comm_size * dtype.size(); + if (use_non_fallback_algo && !is_single_node) { + // plane 0 works on even partitions and plane 1 works on odd partitions of tmp_buf + tmp_bufs.push_back(tmp_buf + tmp_buf_size_per_rank * pair_comm->rank()); + + // scaleout rearranges send_buf into tmp_buf and uses this rearranged buf as input + in_buffers[0] = { tmp_buf.get_ptr(), ccl::ze::ipc_mem_type::memory }; // 0 + } + else { + tmp_bufs.push_back(tmp_buf); + } + tmp_buf_idx_start = in_buffers.size(); in_buffers.push_back({ tmp_bufs.front().get_ptr(), ccl::ze::ipc_mem_type::memory }); // 2 - tmp_buf_idx_start = 2; - // divide a single large buffer across peers - if (use_non_fallback_algo) { - for (int i = 0; i < even_comm_size - 1; i++) { - tmp_bufs.push_back(tmp_bufs.back() + recv_bytes); - } + // for scaleout, plane 0 works on even partitions and plane 1 works on odd partitions + // of tmp_buf and therefore we need to skip two partitions to reach the next one. + const size_t skip_multiplier = is_single_node ? 
1 : pair_comm_size; + for (int i = 0; i < even_comm_size - 1; i++) { + tmp_bufs.push_back(tmp_bufs.back() + tmp_buf_size_per_rank * skip_multiplier); } } - // note: start section common with allreduce topo - // the following section is based on the allreduce topo implementation - // it uses steps from allreduce topo to collect allreduce results on tmp buffer - size_t ipc_event_count{}; size_t max_ipc_event_count{ 6 }; ze_event_pool_handle_t ipc_event_pool{}; @@ -455,31 +523,71 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, skip_rank, ipc_event_pool, ipc_event_count++); - clear_and_push_back(wait_events, out_event); CCL_THROW_IF_NOT(comm_size % 2 == 0, "unexpected comm_size ", comm_size); CCL_THROW_IF_NOT(node_comm_size % 2 == 0, "unexpected node_comm_size ", node_comm_size); if (use_non_fallback_algo) { - if (is_single_card) { + if (!is_single_node) { + // rearrange data from send_buf to tmp_buf + // this is essentially a transpose of input data visualized as + // node_comm_size X r2r_comm_size to r2r_comm_size X node_comm_size + std::vector parallel_copy_events; + for (int node_comm_idx = 0; node_comm_idx < node_comm_size; node_comm_idx++) { + for (int r2r_comm_idx = 0; r2r_comm_idx < r2r_comm_size; r2r_comm_idx++) { + const size_t recv_bytes = recv_count * dtype.size(); + const size_t dst_offset = + (node_comm_idx * r2r_comm_size + r2r_comm_idx) * recv_bytes; + const size_t src_offset = + (r2r_comm_idx * node_comm_size + node_comm_idx) * recv_bytes; + copy_attr attr{}; + attr.direction = copy_direction::d2d; + // TODO: make offset calculation more general and robust + auto entry = entry_factory::create(sched, + send_buf + src_offset, + tmp_buf + dst_offset, + recv_count, + dtype, + attr, + wait_events); + parallel_copy_events.push_back(entry->entry_event); + } + } + + // make sure all ranks have finished rearranging send_buf + ccl::add_comm_barrier(sched, node_comm, parallel_copy_events, out_event); + clear_and_push_back(wait_events, out_event); + }
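The rearrangement loop above is a plain matrix transpose over recv_count-sized blocks. The self-contained sketch below reproduces only the index arithmetic (dst = node_idx * r2r_comm_size + r2r_idx, src = r2r_idx * node_comm_size + node_idx) with assumed communicator sizes, so the copy pattern can be verified without a schedule or device buffers:

```cpp
// Standalone illustration of the block transpose above (not oneCCL code).
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    const int node_comm_size = 4, r2r_comm_size = 2; // assumed sizes
    const std::size_t recv_count = 1;                // one element per block

    std::vector<int> send(node_comm_size * r2r_comm_size);
    for (std::size_t i = 0; i < send.size(); ++i)
        send[i] = static_cast<int>(i); // block id in send order

    std::vector<int> tmp(send.size());
    for (int n = 0; n < node_comm_size; ++n)
        for (int r = 0; r < r2r_comm_size; ++r) {
            const std::size_t dst = (n * r2r_comm_size + r) * recv_count;
            const std::size_t src = (r * node_comm_size + n) * recv_count;
            tmp[dst] = send[src]; // the diff issues one d2d copy per block
        }

    for (int v : tmp)
        std::cout << v << ' '; // prints: 0 4 1 5 2 6 3 7
    std::cout << '\n';
}
```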
+ + // ze_a2a_pipeline_read_write_entry only works for even_comm_size > 1 and + // therefore we need to deal with even_comm_size == 1 separately + if (even_comm_size == 1) { LOG_DEBUG("topo/scale_up/intra: use ze_onesided_reduce"); - const size_t pair_comm_offset = recv_count * pair_comm->rank(); - const ccl_buffer pair_comm_send_buf = send_buf + pair_comm_offset * dtype.size(); - // for inplace, write to temp buffer instead of recv_buf since + const size_t pair_comm_count = recv_count * r2r_comm_size; + const size_t pair_comm_local_offset = pair_comm_count * pair_comm->rank(); + // for single-node use send_buf and for multi-node use rearranged tmp_buf + const ccl_buffer pair_comm_send_buf = + is_single_node ? send_buf + pair_comm_local_offset * dtype.size() + : tmp_buf + pair_comm_local_offset * dtype.size(); + + // for inplace, write to tmp buffer instead of recv_buf since // the other rank is reading from send_buf which is same as recv_buf + // for multi-node, write to tmp_buf that will be later used for scaleout auto entry = entry_factory::create( sched, pair_comm_send_buf, is_inplace || !is_single_node ? *tmp_bufs.begin() : recv_buf, + pair_comm_count, dtype, op, pair_comm->rank(), pair_comm, wait_events, - pair_comm_offset); + pair_comm_local_offset); clear_and_push_back(wait_events, entry->entry_event); - if (is_inplace) { + + // for single-node inplace, copy output from tmp_buf to recv_buf + if (is_inplace && is_single_node) { ccl::add_comm_barrier(sched, pair_comm, wait_events, out_event); clear_and_push_back(wait_events, out_event); copy_attr attr{}; @@ -488,11 +596,8 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, sched, *tmp_bufs.begin(), recv_buf, recv_count, dtype, attr, wait_events); clear_and_push_back(wait_events, copy_entry->entry_event); } - entry_factory::create(sched); - return ccl::status::success; } - - if (pair_comm_size > 1) { + else if (pair_comm_size > 1) { LOG_DEBUG("topo/scale_up/intra: use ze_a2a_pipeline_read_write_entry"); // allreduce and reduce divide whole data into two continuous chunks, @@ -502,71 +607,139 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, ze_a2a_pipeline_read_write_entry::attr attrs{ .use_continous_data = false, .use_remote_target = use_reduce_scatter_pipeline }; - auto entry = entry_factory::create(sched, - comm, - send_buf, - tmp_bufs, - tmp_buf_idx_start, - count, - dtype, - op, - wait_events, - attrs); + auto entry = entry_factory::create( + sched, + comm, + is_single_node ? send_buf : tmp_buf, + tmp_bufs, + tmp_buf_idx_start, + count, + dtype, + op, + wait_events, + attrs); clear_and_push_back(wait_events, entry->entry_event); - // TODO: can even_comm be used instead of node_comm + ccl::add_comm_barrier(sched, node_comm, wait_events, out_event); clear_and_push_back(wait_events, out_event); } - if (use_reduce_scatter_pipeline) { + ccl_buffer out_tmp_buf = *tmp_bufs.begin(); + if (even_comm_size > 1 && use_reduce_scatter_pipeline) { LOG_DEBUG("topo/scale_up/intra: use ze_a2a_pipeline_reduce_entry"); // reduce from local buffer auto entry = entry_factory::create( sched, comm, recv_buf, tmp_bufs, count, dtype, op, wait_events); clear_and_push_back(wait_events, entry->entry_event); } - else { - LOG_DEBUG("topo/scale_up/intra: use ze_a2a_reduce_scatter_entry"); + else if (even_comm_size > 1) { + const size_t tmp_buf_count_per_rank = recv_count * r2r_comm_size; + const size_t tmp_buf_size_per_rank = tmp_buf_count_per_rank * dtype.size(); + if (!is_single_node) { + // plane 0 worked on even partitions and plane 1 + // worked on odd partitions, but we need continuous + // data for ze_a2a_reduce_scatter_entry and therefore + // we pack alternate partitions into continuous data + std::vector parallel_copy_events; + for (int even_comm_idx = 1; even_comm_idx < even_comm_size; even_comm_idx++) { + copy_attr attr{}; + attr.direction = copy_direction::d2d; + auto entry = entry_factory::create( + sched, + tmp_bufs[even_comm_idx], + tmp_bufs.front() + even_comm_idx * tmp_buf_size_per_rank, + tmp_buf_count_per_rank, + dtype, + attr, + wait_events); + parallel_copy_events.push_back(entry->entry_event); + } + ccl::add_comm_barrier(sched, even_comm, parallel_copy_events, out_event); + clear_and_push_back(wait_events, out_event); + } + + // perform xelink read followed by reduce - ccl_buffer tmp_send_buf = send_buf; + ccl_buffer src_send_buf = is_single_node ?
send_buf : tmp_buf; size_t send_buf_idx = 0; // when both tiles are used, we need to read from temporary // which contains the result of mdfi reduce if (pair_comm_size > 1) { - tmp_send_buf = tmp_bufs.front(); + src_send_buf = tmp_bufs.front(); send_buf_idx = tmp_buf_idx_start; } - std::vector block_counts(even_comm_size, recv_count); - auto entry = entry_factory::create(sched, - tmp_send_buf, - recv_buf, - block_counts.data(), - dtype, - op, - even_comm, - wait_events, - send_buf_idx, - 0); // pair_comm_offset - clear_and_push_back(wait_events, entry->entry_event); - } - entry_factory::create(sched); - return ccl::status::success; - } + std::vector block_counts(even_comm_size, recv_count * r2r_comm_size); + + if (!is_rs_write) { + LOG_DEBUG("topo/scale_up/intra: use ze_a2a_reduce_scatter_entry"); + out_tmp_buf = src_send_buf + even_comm->rank() * tmp_buf_size_per_rank; + auto entry = entry_factory::create( + sched, + src_send_buf, + is_single_node ? recv_buf : out_tmp_buf, + block_counts.data(), + dtype, + op, + even_comm, + wait_events, + send_buf_idx, + 0); // pair_comm_offset + clear_and_push_back(wait_events, entry->entry_event); + } + else { + // copy using write + LOG_DEBUG("topo/scale_up/inter: use ze_a2a_reduce_scatter_write_copy_entry"); + reduce_scatter_args rs_args = { even_comm, block_counts, dtype, op }; + reduce_scatter_bufs rs_bufs = { src_send_buf, + is_single_node ? recv_buf : out_tmp_buf, + tmp_write_buf, + tmp_write_buf_idx, + 0 }; //pair_comm_offset + + auto copy_entry = entry_factory::create( + sched, rs_args, rs_bufs, wait_events); + clear_and_push_back(wait_events, copy_entry->entry_event); + ccl::add_comm_barrier( + sched, even_comm, wait_events, out_event, ipc_event_pool, ipc_event_count++); + clear_and_push_back(wait_events, out_event); - size_t base_count = count; - size_t pair_comm_offset = 0; - size_t pair_comm_offset_bytes = 0; + LOG_DEBUG("topo/scale_up/inter: use ze_a2a_reduce_scatter_write_kernel_entry"); + // local reduction + auto kernel_entry = entry_factory::create( + sched, rs_args, rs_bufs, wait_events); + clear_and_push_back(wait_events, kernel_entry->entry_event); + ccl::add_comm_barrier( + sched, even_comm, wait_events, out_event, ipc_event_pool, ipc_event_count++); + clear_and_push_back(wait_events, out_event); + } + } - if (ccl::global_data::env().enable_ze_bidir_algo) { - base_count = count / pair_comm->size(); - pair_comm_offset = base_count * pair_comm->rank(); - pair_comm_offset_bytes = pair_comm_offset * dtype.size(); + if (!is_single_node) { + ccl_coll_param coll_param{ false }; + coll_param.ctype = ccl_coll_reduce_scatter; + coll_param.send_buf = out_tmp_buf; + coll_param.recv_buf = recv_buf; + coll_param.count = recv_count; + coll_param.dtype = dtype; + coll_param.reduction = op; + coll_param.comm = r2r_comm; + coll_param.hint_algo.reduce_scatter = ccl_coll_reduce_scatter_direct; + + ccl::add_scaleout(sched, coll_param, is_single_node, wait_events, out_event); + if (out_event) { + clear_and_push_back(wait_events, out_event); + } + } - if (pair_comm->rank() == pair_comm->size() - 1) - base_count += count % pair_comm->size(); + entry_factory::create(sched); + return ccl::status::success; } - else if (pair_comm->rank() != ccl::global_data::env().kernel_1s_lead) { + // note: start section common with allreduce topo + // the following section is based on the allreduce topo implementation + // it uses steps from allreduce topo to collect allreduce results on tmp buffer + + if (!ccl::global_data::env().enable_ze_bidir_algo && + 
pair_comm->rank() != ccl::global_data::env().kernel_1s_lead) { ccl::add_comm_barrier( sched, pair_comm, wait_events, out_event, ipc_event_pool, ipc_event_count++); @@ -575,15 +748,9 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, ipc_event_count, ", expected max ", max_ipc_event_count); - entry_factory::create(sched); + entry_factory::create(sched); return ccl::status::success; } - - size_t main_block_count = base_count / even_comm_size; - size_t block_count = main_block_count; - if (even_comm->rank() == even_comm_size - 1) { - block_count += base_count % even_comm_size; - } const ccl_buffer tmp_recv_buf = tmp_bufs.front(); const size_t tmp_recv_buf_idx = tmp_buf_idx_start; size_t even_comm_offset_bytes = main_block_count * even_comm->rank() * dtype.size(); @@ -671,13 +838,14 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, } } - ccl_coll_entry_param coll_param{ .ctype = ccl_coll_allreduce, - .send_buf = even_comm_recv_buf, - .recv_buf = even_comm_recv_buf, - .count = block_count, - .dtype = dtype, - .reduction = op, - .comm = r2r_comm }; + ccl_coll_param coll_param{ false }; + coll_param.ctype = ccl_coll_allreduce; + coll_param.send_buf = even_comm_recv_buf; + coll_param.recv_buf = even_comm_recv_buf; + coll_param.count = block_count; + coll_param.dtype = dtype; + coll_param.reduction = op; + coll_param.comm = r2r_comm; ccl::add_scaleout(sched, coll_param, is_single_node, wait_events, out_event); if (out_event) { @@ -729,6 +897,7 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, copy_attr(peer_rank, tmp_recv_buf_idx, copy_direction::t2t, + false, /*pt2pt_op*/ pair_comm, pair_comm_offset, pair_comm_offset), @@ -766,7 +935,7 @@ ccl::status ccl_coll_build_topo_reduce_scatter(ccl_sched* sched, sched->add_barrier(); ccl::add_comm_barrier(sched, node_comm, wait_events, out_event); - entry_factory::create(sched); + entry_factory::create(sched); return ccl::status::success; } diff --git a/src/coll/algorithms/send.cpp b/src/coll/algorithms/send.cpp new file mode 100644 index 000000000..10865f443 --- /dev/null +++ b/src/coll/algorithms/send.cpp @@ -0,0 +1,121 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +/* +* +* (C) 2001 by Argonne National Laboratory. +* See COPYRIGHT in top-level directory. 
+*/ + +#include "coll/algorithms/algorithms.hpp" +#include "coll/coll_util.hpp" +#include "common/utils/utils.hpp" +#include "sched/entry/factory/entry_factory.hpp" + +ccl::status ccl_coll_build_direct_send(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int peer_rank, + ccl_comm* comm) { + ccl::status status = ccl::status::success; + + LOG_DEBUG("build direct SEND: ", comm->rank(), ", count: ", count, ", peer_rank: ", peer_rank); + CCL_THROW_IF_NOT(peer_rank > CCL_INVALID_PEER_RANK_IDX && peer_rank < comm->size(), + "invalid peer_rank: ", + peer_rank); + entry_factory::create(sched, buf, count, dtype, peer_rank, comm); + + return status; +} + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +ccl::status ccl_coll_build_topo_send(ccl_sched* sched, + ccl_buffer buf, + size_t count, + const ccl_datatype& dtype, + int peer_rank, + ccl_comm* comm) { + ccl::status status = ccl::status::success; + + ccl_comm* node_comm = comm->get_node_comm().get(); + auto node_peer_rank = node_comm->get_rank_from_global(peer_rank); + auto node_curr_rank = node_comm->rank(); + + LOG_DEBUG("build topo SEND buf: ", buf.get_ptr(), " and peer_rank: ", node_peer_rank); + CCL_THROW_IF_NOT( + node_peer_rank > CCL_INVALID_PEER_RANK_IDX && node_peer_rank < node_comm->size(), + "invalid peer_rank: ", + node_peer_rank, + " for send op"); + + const std::vector buffer{ + { buf.get_ptr(), ccl::ze::ipc_mem_type::memory }, // 0 idx + }; + + std::vector wait_events{}; + ze_event_handle_t out_event{}; + + ccl::utils::pt2pt_handle_exchange_info info{ node_peer_rank, + ccl::utils::pt2pt_handle_exchange_role::sender }; + if (!ccl::global_data::env().ze_pt2pt_read) { + info.role = ccl::utils::pt2pt_handle_exchange_role::receiver; + } + + ccl::add_handle_exchange(sched, + node_comm, + wait_events, + out_event, + buffer, + ccl_comm::invalid_rank /*skip_rank*/, + nullptr, + 0, + info); + LOG_DEBUG("build SEND: add_handle_exchange is done"); + + ccl_sched_id_t pt2pt_ack_tag = node_comm->get_atl_comm()->tag_creator->get_pt2pt_ack_tag(); + + if (!ccl::global_data::env().ze_pt2pt_read) { + LOG_DEBUG("build SEND: write mode is enabled"); + entry_factory::create( + sched, + buf, + ccl_buffer(), + count, + dtype, + copy_attr(node_peer_rank, 0, copy_direction::d2d, true /*pt2pt_op*/)); + LOG_DEBUG("build SEND: copy_entry is done"); + + uint64_t ack_tag = node_comm->get_atl_comm()->tag_creator->create( + node_curr_rank, node_comm->get_comm_id(), pt2pt_ack_tag, sched->get_op_id()); + ccl::utils::send_ack_to_peer(node_comm->get_atl_comm(), ack_tag, node_peer_rank); + } + else { + uint64_t ack_tag = node_comm->get_atl_comm()->tag_creator->create( + node_peer_rank, node_comm->get_comm_id(), pt2pt_ack_tag, sched->get_op_id()); + ccl::utils::recv_ack_from_peer(node_comm->get_atl_comm(), ack_tag, node_peer_rank); + LOG_DEBUG("build SEND: recv_ack_from_peer is done with tag: ", + ack_tag, + ", comm_rank: ", + comm->rank(), + ", peer_rank: ", + node_peer_rank); + } + + entry_factory::create(sched); + return status; +} +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE diff --git a/src/coll/coll.cpp b/src/coll/coll.cpp index 3ef87e69a..aa443a09d 100644 --- a/src/coll/coll.cpp +++ b/src/coll/coll.cpp @@ -51,20 +51,47 @@ #include "common/global/global.hpp" -#include "coll/algorithms/algorithms.hpp" #include "coll/algorithms/algorithm_utils.hpp" +#include "coll/algorithms/algorithms.hpp" #include "coll/selection/selection.hpp" #include "exec/exec.hpp" #include "fusion/fusion.hpp" -#include "unordered_coll/unordered_coll.hpp" 
#include "sched/entry/factory/entry_factory.hpp" +#include "sched/sched_timer.hpp" +#include "unordered_coll/unordered_coll.hpp" + +ccl_request* allgatherv_case(ccl_coll_param& param) { +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + std::vector events{}; + for (size_t idx = 0; idx < param.deps.size(); idx++) { + events.push_back(param.deps[idx].get_native()); + } + sycl::event ev; +#if ICPX_VERSION >= 140000 + ev = param.stream->get_native_stream().ext_oneapi_submit_barrier(events); +#elif ICPX_VERSION < 140000 + ev = param.stream->get_native_stream().submit_barrier(events); +#endif // ICPX_VERSION + if (ccl::utils::should_use_sycl_output_event(param.stream)) { + ccl_coll_param dummy_param{}; + dummy_param.comm = param.comm; + auto dummy_sched = ccl_sched::create(dummy_param, {}); + auto req = dummy_sched->get_request(); + req->set_native_event(ev); + return req; + } +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + return nullptr; +} /* param is not const because param.comm can be updated for unordered colls */ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr& in_attr) { ccl_coll_attr& attr = const_cast(in_attr); #ifdef CCL_ENABLE_ITT - ccl::profile::itt::task_start(ccl::profile::itt::task_type::api_call); + __itt_event coll_create_itt_event = + ccl::profile::itt::event_get(ccl_coll_type_to_str(param.ctype)); + ccl::profile::itt::event_start(coll_create_itt_event); #endif // CCL_ENABLE_ITT #ifdef CCL_ENABLE_SYCL @@ -98,6 +125,10 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr& ccl::global_data& data = ccl::global_data::get(); + if (param.ctype == ccl_coll_allgatherv && param.comm->size() == 1 && param.is_inplace()) { + return allgatherv_case(param); + } + /* 1. decide whether schedule should be postponed (this includes caching and starting) */ bool postpone_schedule = false; if (ccl::global_data::env().enable_unordered_coll) { @@ -125,11 +156,6 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr& /* 2. create or get schedule */ ccl_sched* sched = ccl_sched::create(param, attr); - // TODO: Bug: this should be set by restart_manager, otherwise there's a race condition. - // Also, this code doesn't belong here anyway. - // There has to be a copy constructor for ccl_coll_param; move this code there. - (sched->coll_param).peer_rank = param.peer_rank; - /* 3. 
     if (!postpone_schedule && ccl::global_data::env().enable_fusion
@@ -166,7 +192,11 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
     ccl_request* request = sched->start(data.executor.get());
     if (sched->coll_attr.synchronous) {
         request->synchronous = true;
-        ccl_wait_impl<ccl_sched>(data.executor.get(), request);
+        // request->synchronous is true,
+        // so ccl_wait_impl should not release the `request`
+        auto wait_result = ccl_wait_impl<ccl_sched>(data.executor.get(), request);
+        CCL_THROW_IF_NOT(wait_result != ccl_wait_result_completed_released,
+                         "internal error, valid request was released");
     }
 #ifdef CCL_ENABLE_SYCL
     else if (ccl::utils::should_use_sycl_output_event(sched->coll_param.stream)) {
@@ -194,7 +224,7 @@ static ccl_request* ccl_coll_create(ccl_coll_param& param, const ccl_coll_attr&
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_ITT
-    ccl::profile::itt::task_end(ccl::profile::itt::task_type::api_call);
+    ccl::profile::itt::event_end(coll_create_itt_event);
 #endif // CCL_ENABLE_ITT
 
     return request;
@@ -229,15 +259,13 @@ ccl::status ccl_coll_build_allgatherv(ccl_sched* sched,
 
     auto get_coll_param = [&]() {
         ccl_coll_param coll_param{};
         coll_param.ctype = ccl_coll_allgatherv;
         coll_param.send_bufs.push_back(send_buf.get_ptr());
         coll_param.send_counts.push_back(send_count);
         coll_param.recv_bufs.push_back(recv_buf.get_ptr());
         coll_param.recv_counts.reserve(comm->size());
         coll_param.recv_counts.insert(
             coll_param.recv_counts.end(), recv_counts, recv_counts + comm->size());
         coll_param.dtype = dtype;
         coll_param.comm = comm;
+        coll_param.stream = sched->coll_param.stream;
         return coll_param;
     };
@@ -586,6 +614,7 @@ ccl::status ccl_coll_build_reduce_scatter(ccl_sched* sched,
                                           const ccl_datatype& dtype,
                                           ccl::reduction reduction,
                                           ccl_comm* comm,
+                                          bool is_scaleout,
                                           bool from_allreduce) {
     ccl::status status = ccl::status::success;
@@ -600,6 +629,7 @@ ccl::status ccl_coll_build_reduce_scatter(ccl_sched* sched,
     param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
 #endif // CCL_ENABLE_SYCL
     param.hint_algo = sched->hint_algo;
+    param.is_scaleout = is_scaleout;
 
     auto algo = ccl::global_data::get().algorithm_selector->get(param);
 
@@ -642,12 +672,37 @@ ccl::status ccl_coll_build_recv(ccl_sched* sched,
                                 ccl_comm* comm) {
     ccl::status status = ccl::status::success;
 
-    LOG_DEBUG("build recv ", comm->rank(), " count: ", count, ", peer_rank: ", peer_rank);
-    CCL_THROW_IF_NOT(peer_rank > CCL_INVALID_PEER_RANK_IDX && peer_rank < comm->size(),
-                     "invalid peer_rank: ",
-                     peer_rank);
+    ccl_selector_param param;
     sched->coll_param.ctype = ccl_coll_recv;
-    entry_factory::create<recv_entry>(sched, buf, count, dtype, peer_rank, comm);
+    param.ctype = sched->coll_param.ctype;
+    param.count = count;
+    param.dtype = dtype;
+    param.comm = comm;
+    param.stream = sched->coll_param.stream;
+    param.buf = buf.get_ptr();
+#ifdef CCL_ENABLE_SYCL
+    param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
+#endif // CCL_ENABLE_SYCL
+    param.peer_rank = peer_rank;
+    param.hint_algo = sched->hint_algo;
+
+    auto algo = ccl::global_data::get().algorithm_selector->get(param);
+
+    switch (algo) {
+        case ccl_coll_recv_direct:
+        case ccl_coll_recv_offload:
CCL_CALL(ccl_coll_build_direct_recv(sched, buf, count, dtype, peer_rank, comm)); + break; + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + case ccl_coll_recv_topo: + CCL_CALL(ccl_coll_build_topo_recv(sched, buf, count, dtype, peer_rank, comm)); + break; +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + default: + CCL_FATAL("unexpected recv_algo ", ccl_coll_algorithm_to_str(algo)); + return ccl::status::invalid_arguments; + } return status; } @@ -660,12 +715,37 @@ ccl::status ccl_coll_build_send(ccl_sched* sched, ccl_comm* comm) { ccl::status status = ccl::status::success; - LOG_DEBUG("build send: ", comm->rank(), ", count: ", count, ", peer_rank: ", peer_rank); - CCL_THROW_IF_NOT(peer_rank > CCL_INVALID_PEER_RANK_IDX && peer_rank < comm->size(), - "invalid peer_rank: ", - peer_rank); + ccl_selector_param param; sched->coll_param.ctype = ccl_coll_send; - entry_factory::create(sched, buf, count, dtype, peer_rank, comm); + param.ctype = sched->coll_param.ctype; + param.count = count; + param.dtype = dtype; + param.comm = comm; + param.stream = sched->coll_param.stream; + param.buf = buf.get_ptr(); +#ifdef CCL_ENABLE_SYCL + param.is_sycl_buf = sched->coll_attr.is_sycl_buf; +#endif // CCL_ENABLE_SYCL + param.peer_rank = peer_rank; + param.hint_algo = sched->hint_algo; + + auto algo = ccl::global_data::get().algorithm_selector->get(param); + + switch (algo) { + case ccl_coll_send_direct: + case ccl_coll_send_offload: + CCL_CALL(ccl_coll_build_direct_send(sched, buf, count, dtype, peer_rank, comm)); + break; + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + case ccl_coll_send_topo: + CCL_CALL(ccl_coll_build_topo_send(sched, buf, count, dtype, peer_rank, comm)); + break; +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + default: + CCL_FATAL("unexpected send_algo ", ccl_coll_algorithm_to_str(algo)); + return ccl::status::invalid_arguments; + } return status; } @@ -745,6 +825,12 @@ ccl_request* ccl_barrier_impl(ccl_comm* comm, ccl_coll_attr attr{}; attr.synchronous = 1; +#ifdef CCL_ENABLE_SYCL + if (!ccl::global_data::env().sync_barrier && ccl::is_queue_in_order(stream)) { + attr.synchronous = 0; + } +#endif // CCL_ENABLE_SYCL + auto req = ccl_coll_create(param, attr); LOG_DEBUG("coll ", ccl_coll_type_to_str(param.ctype), " created, req ", req) diff --git a/src/coll/coll.hpp b/src/coll/coll.hpp index cc4d7382e..de9192d36 100644 --- a/src/coll/coll.hpp +++ b/src/coll/coll.hpp @@ -91,6 +91,7 @@ ccl::status ccl_coll_build_reduce_scatter(ccl_sched* sched, const ccl_datatype& dtype, ccl::reduction reduction, ccl_comm* comm, + bool is_scaleout, bool from_allreduce = false); ccl::status ccl_coll_build_recv(ccl_sched* sched, diff --git a/src/coll/coll_param.cpp b/src/coll/coll_param.cpp index 213d405a5..b0b931790 100644 --- a/src/coll/coll_param.cpp +++ b/src/coll/coll_param.cpp @@ -96,33 +96,48 @@ std::string ccl_coll_attr::to_string() const { return ss.str(); } -ccl_coll_param::ccl_coll_param() { - ctype = ccl_coll_last_value; - send_bufs.reserve(1); - recv_bufs.reserve(1); - send_counts.reserve(1); - recv_counts.reserve(1); - stream = nullptr; - comm = nullptr; - is_scaleout = false; -} void ccl_coll_param::copy(const ccl_coll_param& other) { ctype = other.ctype; + hint_algo = other.hint_algo; + send_buf = other.send_buf; + recv_buf = other.recv_buf; send_bufs = other.send_bufs; recv_bufs = other.recv_bufs; - device_send_bufs = other.device_send_bufs; - device_recv_bufs = other.device_recv_bufs; + send_dev_bufs = other.send_dev_bufs; + recv_dev_bufs = other.recv_dev_bufs; send_counts = 
other.send_counts; recv_counts = other.recv_counts; + send_count = other.send_count; + count = other.count; dtype = other.dtype; reduction = other.reduction; root = other.root; comm = other.comm; stream = other.stream; + peer_rank = other.peer_rank; is_scaleout = other.is_scaleout; + is_validate = other.is_validate; + is_pt2pt = other.is_pt2pt; copy_deps(other.deps); - validate(); + + if (is_validate) { + validate(); + } } + +ccl_coll_param::ccl_coll_param(bool in_is_validate) { + ctype = ccl_coll_last_value; + send_bufs.reserve(1); + recv_bufs.reserve(1); + send_counts.reserve(1); + recv_counts.reserve(1); + stream = nullptr; + comm = nullptr; + is_scaleout = false; + is_validate = in_is_validate; + is_pt2pt = false; +} + ccl_coll_param::ccl_coll_param(const ccl_coll_param& other) { copy(other); } @@ -134,13 +149,15 @@ std::string ccl_coll_param::to_string() const { ss << "coll: " << ccl_coll_type_to_str(ctype); if (!send_bufs.empty()) { - ss << ", sb: " << get_send_buf() - << ", sc: " << std::accumulate(send_counts.begin(), send_counts.end(), 0); + ss << ", sb: " << get_send_buf() << ", sc: " + << std::accumulate( + send_counts.begin(), send_counts.end(), ccl::utils::initial_count_value); } if (!recv_bufs.empty()) { - ss << ", rb: " << get_recv_buf() - << ", rc: " << std::accumulate(recv_counts.begin(), recv_counts.end(), 0); + ss << ", rb: " << get_recv_buf() << ", rc: " + << std::accumulate( + recv_counts.begin(), recv_counts.end(), ccl::utils::initial_count_value); } if (ctype != ccl_coll_barrier) { @@ -176,33 +193,42 @@ std::string ccl_coll_param::to_string() const { } void* ccl_coll_param::get_send_buf(size_t idx, ccl_coll_param::buf_type type) const { - auto& vec = (type == ccl_coll_param::buf_type::regular) ? send_bufs : device_send_bufs; - CCL_THROW_IF_NOT(idx < vec.size(), "coll ", ctype, ", unexpected idx ", idx); + auto& vec = (type == ccl_coll_param::buf_type::regular) ? send_bufs : send_dev_bufs; + CCL_THROW_IF_NOT(idx < vec.size() || (ctype == ccl_coll_last_value && idx == vec.size()), + "coll ", + ctype, + ", unexpected idx ", + idx); return vec[idx]; } void* ccl_coll_param::get_recv_buf(size_t idx, ccl_coll_param::buf_type type) const { - auto& vec = (type == ccl_coll_param::buf_type::regular) ? recv_bufs : device_recv_bufs; + auto& vec = (type == ccl_coll_param::buf_type::regular) ? recv_bufs : recv_dev_bufs; CCL_THROW_IF_NOT(idx < vec.size(), "coll ", ctype, ", unexpected idx ", idx); return vec[idx]; } void* ccl_coll_param::get_send_buf_ptr(size_t idx, ccl_coll_param::buf_type type) const { - auto& vec = (type == ccl_coll_param::buf_type::regular) ? send_bufs : device_send_bufs; + auto& vec = (type == ccl_coll_param::buf_type::regular) ? send_bufs : send_dev_bufs; CCL_THROW_IF_NOT(idx < vec.size(), "coll ", ctype, ", unexpected idx ", idx); void* res = (void*)(&vec[idx]); return res; } void* ccl_coll_param::get_recv_buf_ptr(size_t idx, ccl_coll_param::buf_type type) const { - auto& vec = (type == ccl_coll_param::buf_type::regular) ? recv_bufs : device_recv_bufs; + auto& vec = (type == ccl_coll_param::buf_type::regular) ? 
recv_bufs : recv_dev_bufs; CCL_THROW_IF_NOT(idx < vec.size(), "coll ", ctype, ", unexpected idx ", idx); void* res = (void*)(&vec[idx]); return res; } size_t ccl_coll_param::get_send_count(size_t idx) const { - CCL_THROW_IF_NOT(idx < send_counts.size(), "coll ", ctype, ", unexpected idx ", idx); + CCL_THROW_IF_NOT( + idx < send_counts.size() || (ctype == ccl_coll_last_value && idx == send_counts.size()), + "coll ", + ctype, + ", unexpected idx ", + idx); return send_counts[idx]; } @@ -216,26 +242,26 @@ bool ccl_coll_param::is_inplace(buf_type type) const { return true; } - void* send_buf = nullptr; - void* recv_buf = nullptr; + void* send_buf_ptr = nullptr; + void* recv_buf_ptr = nullptr; if ((ctype == ccl_coll_alltoall || ctype == ccl_coll_alltoallv) && (send_bufs.size() > 1)) { - send_buf = get_send_buf(comm->rank(), type); + send_buf_ptr = get_send_buf(comm->rank(), type); } else { - send_buf = get_send_buf(0, type); + send_buf_ptr = get_send_buf(0, type); } if ((ctype == ccl_coll_allgatherv || ctype == ccl_coll_alltoall || ctype == ccl_coll_alltoallv) && (recv_bufs.size() > 1)) { - recv_buf = get_recv_buf(comm->rank(), type); + recv_buf_ptr = get_recv_buf(comm->rank(), type); } else { - recv_buf = get_recv_buf(0, type); + recv_buf_ptr = get_recv_buf(0, type); } - return (send_buf && (send_buf == recv_buf)) ? true : false; + return (send_buf_ptr && (send_buf_ptr == recv_buf_ptr)) ? true : false; } std::vector ccl_coll_param::get_all_non_zero_bufs() const { @@ -247,11 +273,13 @@ std::vector ccl_coll_param::get_all_non_zero_bufs() const { including nullptr and invalid pointer don't validate nor dereference it */ - if (std::accumulate(send_counts.begin(), send_counts.end(), 0) > 0) { + if (std::accumulate( + send_counts.begin(), send_counts.end(), ccl::utils::initial_count_value) > 0) { bufs.push_back(get_send_buf()); } - if (std::accumulate(recv_counts.begin(), recv_counts.end(), 0) > 0) { + if (std::accumulate( + recv_counts.begin(), recv_counts.end(), ccl::utils::initial_count_value) > 0) { bufs.push_back(get_recv_buf()); } break; @@ -261,7 +289,8 @@ std::vector ccl_coll_param::get_all_non_zero_bufs() const { bufs.push_back(get_send_buf()); } - if (std::accumulate(recv_counts.begin(), recv_counts.end(), 0) > 0) { + if (std::accumulate( + recv_counts.begin(), recv_counts.end(), ccl::utils::initial_count_value) > 0) { if (recv_bufs.size() == 1) { bufs.push_back(get_recv_buf()); } @@ -297,15 +326,16 @@ void ccl_coll_param::validate() const { return; } - LOG_TRACE("validate coll_param, coll: ", ccl_coll_type_to_str(ctype)); - CCL_THROW_IF_NOT(!send_counts.empty(), "empty send_counts"); - CCL_THROW_IF_NOT(!recv_counts.empty(), "empty recv_counts"); - CCL_THROW_IF_NOT(comm, "null comm"); - if (ctype == ccl_coll_barrier) { return; } + LOG_TRACE("validate coll_param, coll: ", ccl_coll_type_to_str(ctype)); + CCL_THROW_IF_NOT( + !send_counts.empty(), "empty send_counts: ctype: ", ccl_coll_type_to_str(ctype)); + CCL_THROW_IF_NOT( + !recv_counts.empty(), "empty recv_counts ctype: ", ccl_coll_type_to_str(ctype)); + CCL_THROW_IF_NOT(!send_bufs.empty(), "empty send_bufs"); CCL_THROW_IF_NOT(!recv_bufs.empty(), "empty recv_bufs"); @@ -688,6 +718,7 @@ ccl_coll_param ccl_coll_param::create_recv_param(void* recv_buf, param.recv_bufs.push_back(recv_buf); param.recv_counts.push_back(recv_count); param.peer_rank = peer_rank; + param.is_pt2pt = true; param.set_common_fields(dtype, comm, stream, deps); param.validate(); @@ -710,6 +741,7 @@ ccl_coll_param ccl_coll_param::create_send_param(const void* send_buf, 
param.recv_bufs.push_back((void*)send_buf); param.recv_counts.push_back(send_count); param.peer_rank = peer_rank; + param.is_pt2pt = true; param.set_common_fields(dtype, comm, stream, deps); param.validate(); diff --git a/src/coll/coll_param.hpp b/src/coll/coll_param.hpp index b3700cac3..fa1e3bb7d 100644 --- a/src/coll/coll_param.hpp +++ b/src/coll/coll_param.hpp @@ -19,6 +19,7 @@ #include "coll/algorithms/algorithm_utils.hpp" #include "common/datatype/datatype.hpp" +#include "common/utils/buffer.hpp" #include "oneapi/ccl.hpp" class ccl_comm; @@ -47,6 +48,7 @@ using ccl_sycl_buffer_one_dim_types = std::tuple #define CCL_INVALID_GROUP_IDX (-1) #define CCL_INVALID_PROC_IDX (-1) #define CCL_INVALID_PEER_RANK_IDX (-1) +#define CCL_INVALID_ROOT_RANK_IDX (-1) struct ccl_coll_attr { ccl_coll_attr() = default; @@ -88,35 +90,53 @@ struct ccl_coll_attr { struct ccl_coll_param { enum class buf_type { regular, device }; - ccl_coll_type ctype; + ccl_coll_type ctype = ccl_coll_last_value; + ccl_coll_algo hint_algo{}; - std::vector send_bufs; - std::vector recv_bufs; + // for ccl_coll_build_ of build_sched + ccl_buffer send_buf{}; + ccl_buffer recv_buf{}; + + // in case of: ccl_coll_param::create__param + std::vector send_bufs{}; + std::vector recv_bufs{}; + + // for host transfer in add_scaleout case + // of topo algos in coll_param.cpp + std::vector send_scale_out_bufs{}; + std::vector recv_scale_out_bufs{}; /* filled if pre-post copy is used to keep original send/recv buffers send_buf and recv_buf fields are replaced by staging buffers */ - std::vector device_send_bufs; - std::vector device_recv_bufs; + std::vector send_dev_bufs{}; + std::vector recv_dev_bufs{}; + + std::vector send_counts{}; + std::vector recv_counts{}; + size_t send_count{}; + size_t count{}; - std::vector send_counts; - std::vector recv_counts; + ccl_datatype dtype = {}; + ccl::reduction reduction = ccl::reduction::sum; + int root = CCL_INVALID_ROOT_RANK_IDX, peer_rank = CCL_INVALID_PEER_RANK_IDX; - ccl_datatype dtype; - ccl::reduction reduction; - int root, peer_rank = CCL_INVALID_PEER_RANK_IDX; int group_id = CCL_INVALID_GROUP_IDX; - ccl_stream* stream; - ccl_comm* comm; - std::vector deps; - bool is_scaleout; - ccl_coll_param(); + ccl_stream* stream = nullptr; + ccl_comm* comm = nullptr; + + std::vector deps{}; + bool is_scaleout{ false }; + bool is_validate{ true }; + bool is_pt2pt{ false }; + + ccl_coll_param(bool in_is_validate = true); ccl_coll_param(const ccl_coll_param& other); ccl_coll_param& operator=(const ccl_coll_param& other) { - if (this != &other) { + if (this != &other && is_validate) { copy(other); } return *this; diff --git a/src/coll/coll_util.cpp b/src/coll/coll_util.cpp index d570a9b20..5f62cc0d3 100644 --- a/src/coll/coll_util.cpp +++ b/src/coll/coll_util.cpp @@ -19,11 +19,12 @@ #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) #include "sched/entry/ze/ze_event_signal_entry.hpp" #include "sched/entry/ze/ze_event_wait_entry.hpp" +#include "sched/entry/ze/ze_pt2pt_barrier_entry.hpp" #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL namespace ccl { -void add_coll_entry(ccl_sched* sched, const ccl_coll_entry_param& param) { +void add_coll_entry(ccl_sched* sched, const ccl_coll_param& param) { ccl_selector_param selector_param; if (param.ctype == ccl_coll_send || param.ctype == ccl_coll_recv) { @@ -35,7 +36,8 @@ void add_coll_entry(ccl_sched* sched, const ccl_coll_entry_param& param) { if (param.ctype == ccl_coll_allgatherv) { selector_param.count = param.send_count; } - selector_param.recv_counts = 
param.recv_counts;
+        selector_param.recv_counts =
+            const_cast<size_t*>(reinterpret_cast<const size_t*>(param.recv_counts.data()));
     selector_param.dtype = param.dtype;
     selector_param.comm = param.comm;
     selector_param.stream = param.stream;
@@ -45,6 +47,7 @@ void add_coll_entry(ccl_sched* sched, const ccl_coll_entry_param& param)
     selector_param.is_sycl_buf = sched->coll_attr.is_sycl_buf;
 #endif // CCL_ENABLE_SYCL
     selector_param.hint_algo = param.hint_algo;
+    selector_param.peer_rank = param.peer_rank;
     selector_param.is_scaleout = param.is_scaleout;
 
 #ifdef CCL_ENABLE_SYCL
@@ -114,7 +117,7 @@ void add_comm_barrier(ccl_sched* sched,
         entry_factory::create(sched, comm, ipc_pool, ipc_event_idx);
     }
     else {
-        ccl_coll_entry_param barrier_param{};
+        ccl_coll_param barrier_param{};
         barrier_param.ctype = ccl_coll_barrier;
         barrier_param.comm = comm;
 
@@ -149,30 +152,43 @@ void add_handle_exchange(ccl_sched* sched,
                          const std::vector<ze_handle_exchange_entry::mem_desc_t>& in_buffers,
                          int skip_rank,
                          ze_event_pool_handle_t pool,
-                         size_t event_idx) {
+                         size_t event_idx,
+                         const ccl::utils::pt2pt_handle_exchange_info& info) {
     if (!wait_events.empty()) {
         ccl::add_wait_events(sched, wait_events);
     }
     if (sched->coll_attr.to_cache) {
         sched->set_entry_exec_mode(ccl_sched_entry_exec_once);
-        entry_factory::create<ze_handle_exchange_entry>(sched, comm, in_buffers, skip_rank);
+        entry_factory::create<ze_handle_exchange_entry>(sched, comm, in_buffers, skip_rank, info);
         sched->add_barrier();
         sched->set_entry_exec_mode(ccl_sched_entry_exec_regular);
 
-        // TODO: no need barrier for the first iteration where ze_handle_exchange_entry exists
-        add_comm_barrier(sched, comm, {}, out_event, pool, event_idx);
+        if (sched->coll_param.ctype == ccl_coll_recv || sched->coll_param.ctype == ccl_coll_send) {
+            // this entry emulates an 'alignment' between the send and recv ops.
+            // the alignment is needed to avoid situations where a send or recv
+            // could run ahead (e.g. initialize the send buffer) without waiting
+            // for the previous pair of ops to finish. it has to be an entry
+            // because the alignment must live in the schedule together with the
+            // other entries and execute before them; that is why
+            // sched->add_barrier() is used.
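The 'alignment' just described is easiest to picture as a rendezvous entry at the head of
both schedules. A standalone sketch under that assumption, where std::barrier (C++20)
stands in for the paired pt2pt barrier entries; nothing below is oneCCL API, and the
actual entry creation from the patch follows right after:

    #include <barrier>
    #include <functional>
    #include <iostream>
    #include <thread>
    #include <vector>

    using entry = std::function<void()>;

    int main() {
        std::barrier sync_point(2); // one rendezvous shared by the send and recv schedules
        auto make_sched = [&](entry payload) {
            return std::vector<entry>{
                [&] { sync_point.arrive_and_wait(); }, // the 'alignment' entry
                std::move(payload) // copy/wait entries only run after it
            };
        };
        auto send_sched = make_sched([] { std::cout << "send payload\n"; });
        auto recv_sched = make_sched([] { std::cout << "recv payload\n"; });
        std::thread s([&] { for (auto& e : send_sched) e(); });
        std::thread r([&] { for (auto& e : recv_sched) e(); });
        s.join();
        r.join();
    }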
+            entry_factory::create<ze_pt2pt_barrier_entry>(sched, comm, info.peer_rank);
+            sched->add_barrier();
+        }
+        else {
+            // TODO: no need barrier for the first iteration where ze_handle_exchange_entry exists
+            add_comm_barrier(sched, comm, {}, out_event, pool, event_idx);
+        }
     }
     else {
-        entry_factory::create<ze_handle_exchange_entry>(sched, comm, in_buffers, skip_rank);
+        entry_factory::create<ze_handle_exchange_entry>(sched, comm, in_buffers, skip_rank, info);
         sched->add_barrier();
         out_event = ccl::add_signal_event(sched);
     }
 }
 
-void add_coll(ccl_sched* sched,
-              const ccl_coll_entry_param& param,
-              const std::vector<ze_event_handle_t>& wait_events,
-              ze_event_handle_t& out_event) {
+ze_event_handle_t add_coll(ccl_sched* sched,
+                           const ccl_coll_param& param,
+                           const std::vector<ze_event_handle_t>& wait_events) {
     if (sched->use_single_list) {
         ccl::add_wait_events(sched, wait_events);
     }
@@ -191,6 +207,17 @@ void add_coll(ccl_sched* sched,
                                                             param.stream);
             break;
         }
+        case ccl_coll_reduce_scatter: {
+            coll_param = ccl_coll_param::create_reduce_scatter_param(param.send_buf.get_src(),
+                                                                     param.recv_buf.get_src(),
+                                                                     param.count,
+                                                                     param.dtype.idx(),
+                                                                     param.reduction,
+                                                                     attr,
+                                                                     param.comm,
+                                                                     param.stream);
+            break;
+        }
         case ccl_coll_reduce: {
             coll_param = ccl_coll_param::create_reduce_param(param.send_buf.get_src(),
                                                              param.recv_buf.get_src(),
@@ -205,9 +232,9 @@ void add_coll(ccl_sched* sched,
         }
         case ccl_coll_alltoallv: {
             coll_param = ccl_coll_param::create_alltoallv_param(param.send_buf.get_src(),
-                                                                param.send_counts,
+                                                                param.send_counts.data(),
                                                                 param.recv_buf.get_src(),
-                                                                param.recv_counts,
+                                                                param.recv_counts.data(),
                                                                 param.dtype.idx(),
                                                                 attr,
                                                                 param.comm,
@@ -218,7 +245,7 @@ void add_coll(ccl_sched* sched,
         case ccl_coll_allgatherv: {
             coll_param = ccl_coll_param::create_allgatherv_param(param.send_buf.get_src(),
                                                                  param.send_count,
                                                                  param.recv_buf.get_src(),
-                                                                 param.recv_counts,
+                                                                 param.recv_counts.data(),
                                                                  param.dtype.idx(),
                                                                  attr,
                                                                  param.comm,
@@ -240,10 +267,8 @@ void add_coll(ccl_sched* sched,
     }
 
     sched->add_barrier();
-    if (out_event) {
-        auto signal_event = ccl::add_signal_event(sched);
-        out_event = signal_event;
-    }
+    auto out_event = ccl::add_signal_event(sched);
+    return out_event;
 }
 
 ze_event_handle_t add_copy_entry(ccl_buffer src,
@@ -261,12 +286,14 @@ ze_event_handle_t add_copy_entry(ccl_buffer src,
 ze_event_handle_t add_copy_entry_with_offset(std::vector<ccl_buffer> bufs,
                                              ccl_buffer buf,
-                                             const size_t* counts,
-                                             const size_t counts_size,
+                                             const std::vector<size_t> counts,
+                                             ccl_comm* comm,
                                              const ccl_datatype dtype,
                                              const copy_attr& copy_attr,
                                              ccl_sched* sched,
-                                             const std::vector<ze_event_handle_t>& wait_events) {
+                                             const std::vector<ze_event_handle_t>& wait_events,
+                                             bool is_skip_own_rank = false) {
+    const size_t counts_size = comm->size();
     CCL_THROW_IF_NOT(bufs.size() == counts_size,
                      "buffers number is different from the number of counts");
     size_t offset = 0;
@@ -277,17 +304,19 @@ ze_event_handle_t add_copy_entry_with_offset(std::vector<ccl_buffer> bufs,
             continue;
         }
 
-        ccl_buffer src = bufs[idx];
-        ccl_buffer dst = buf + offset;
-        // reverse the function logic (src <=> dst)
-        if (copy_attr.direction == copy_direction::h2d) {
-            src = buf + offset;
-            dst = bufs[idx];
+        if (!is_skip_own_rank || idx != size_t(comm->rank())) {
+            ccl_buffer src = bufs[idx];
+            ccl_buffer dst = buf + offset;
+            // reverse the function logic (src <=> dst)
+            if (copy_attr.direction == copy_direction::h2d) {
+                src = buf + offset;
+                dst = bufs[idx];
+            }
+            auto out_event =
+                add_copy_entry(src, dst, counts[idx], dtype, copy_attr, sched, wait_events);
+            out_events.push_back(out_event);
         }
-        auto out_event =
-            add_copy_entry(src, dst, counts[idx], dtype, copy_attr, sched, wait_events);
         offset += counts[idx] * dtype.size();
-
out_events.push_back(out_event); } LOG_DEBUG("add_copy_entry_with_offset done"); @@ -295,8 +324,8 @@ ze_event_handle_t add_copy_entry_with_offset(std::vector bufs, return add_signal_event(sched); } -ze_event_handle_t fill_scaleout_coll_param(const ccl_coll_entry_param& in_coll_param, - ccl_coll_entry_param& out_coll_param, +ze_event_handle_t fill_scaleout_coll_param(const ccl_coll_param& in_coll_param, + ccl_coll_param& out_coll_param, ccl_sched* sched, const std::vector& wait_events) { ze_event_handle_t out_event{}; @@ -312,21 +341,26 @@ ze_event_handle_t fill_scaleout_coll_param(const ccl_coll_entry_param& in_coll_p // calculate counts if (ctype == ccl_coll_alltoallv) { - size_t a2av_send_count = std::accumulate( - out_coll_param.send_counts, out_coll_param.send_counts + counts_size, 0); - size_t a2av_recv_count = std::accumulate( - out_coll_param.recv_counts, out_coll_param.recv_counts + counts_size, 0); + size_t a2av_send_count = std::accumulate(out_coll_param.send_counts.begin(), + out_coll_param.send_counts.end(), + ccl::utils::initial_count_value); + size_t a2av_recv_count = std::accumulate(out_coll_param.recv_counts.begin(), + out_coll_param.recv_counts.end(), + ccl::utils::initial_count_value); a2av_send_bytes = a2av_send_count * dtype_size; a2av_recv_bytes = a2av_recv_count * dtype_size; } else if (ctype == ccl_coll_alltoall || ctype == ccl_coll_allgatherv) { // assume sum of send_counts and recv_counts are equal - host_buf_size = - std::accumulate( - out_coll_param.recv_counts, out_coll_param.recv_counts + counts_size, 0) * - dtype_size; + host_buf_size = std::accumulate(out_coll_param.recv_counts.begin(), + out_coll_param.recv_counts.end(), + ccl::utils::initial_count_value) * + dtype_size; LOG_DEBUG("alltoall(v)/allgatherv scale_out host buf size: ", host_buf_size); } + else if (ctype == ccl_coll_reduce_scatter) { + host_buf_size = out_coll_param.count * counts_size * dtype_size; + } else { host_buf_size = out_coll_param.count * dtype_size; } @@ -349,40 +383,41 @@ ze_event_handle_t fill_scaleout_coll_param(const ccl_coll_entry_param& in_coll_p out_coll_param.recv_buf = out_coll_param.send_buf; } - // transform array of buffers in contiguos buffer with offsets + // transform array of buffers in contiguous buffer with offsets if (ctype == ccl_coll_alltoallv) { - auto send_out_event = add_copy_entry_with_offset(in_coll_param.send_bufs, + auto send_out_event = add_copy_entry_with_offset(in_coll_param.send_scale_out_bufs, out_coll_param.send_buf, out_coll_param.send_counts, - counts_size, + out_coll_param.comm, out_coll_param.dtype, copy_attr(copy_direction::d2h), sched, wait_events); - out_event = add_copy_entry_with_offset(in_coll_param.recv_bufs, + out_event = add_copy_entry_with_offset(in_coll_param.recv_scale_out_bufs, out_coll_param.recv_buf, out_coll_param.recv_counts, - counts_size, + out_coll_param.comm, out_coll_param.dtype, copy_attr(copy_direction::d2h), sched, std::vector{ send_out_event }); } else if (ctype == ccl_coll_alltoall) { - out_event = add_copy_entry_with_offset(in_coll_param.send_bufs, + out_event = add_copy_entry_with_offset(in_coll_param.send_scale_out_bufs, out_coll_param.send_buf, out_coll_param.send_counts, - counts_size, + out_coll_param.comm, out_coll_param.dtype, copy_attr(copy_direction::d2h), sched, wait_events); } else if (ctype == ccl_coll_allgatherv) { - size_t offset = std::accumulate(out_coll_param.recv_counts, - out_coll_param.recv_counts + out_coll_param.comm->rank(), - 0) * - dtype_size; + size_t offset = + 
std::accumulate(out_coll_param.recv_counts.begin(), + out_coll_param.recv_counts.begin() + out_coll_param.comm->rank(), + ccl::utils::initial_count_value) * + dtype_size; out_event = add_copy_entry(in_coll_param.send_buf, out_coll_param.send_buf + offset, out_coll_param.send_count, @@ -391,6 +426,15 @@ ze_event_handle_t fill_scaleout_coll_param(const ccl_coll_entry_param& in_coll_p sched, wait_events); } + else if (ctype == ccl_coll_reduce_scatter) { + out_event = add_copy_entry(in_coll_param.send_buf, + out_coll_param.send_buf, + out_coll_param.count * counts_size, + out_coll_param.dtype, + copy_attr(copy_direction::d2h), + sched, + wait_events); + } else { out_event = add_copy_entry(in_coll_param.send_buf, out_coll_param.send_buf, @@ -404,7 +448,7 @@ ze_event_handle_t fill_scaleout_coll_param(const ccl_coll_entry_param& in_coll_p } void add_scaleout(ccl_sched* sched, - const ccl_coll_entry_param& in_coll_param, + const ccl_coll_param& in_coll_param, const bool is_single_node, const std::vector& in_wait_events, ze_event_handle_t& out_event, @@ -413,14 +457,16 @@ void add_scaleout(ccl_sched* sched, ccl_buffer global_recv_buf, int global_root) { std::vector wait_events{ in_wait_events }; - ccl_coll_entry_param coll_param(in_coll_param); + ccl_coll_param coll_param(in_coll_param); out_event = nullptr; - bool multi_node = (!is_single_node && (coll_param.count || coll_param.recv_counts)); + bool multi_node = + (!is_single_node && (coll_param.count != 0 || coll_param.recv_counts.size() != 0)); bool enable_hmem = (ccl::global_data::env().use_hmem && atl_base_comm::attr.out.enable_hmem); bool do_h2d_copy = - ((coll_param.ctype == ccl_coll_allreduce || coll_param.ctype == ccl_coll_alltoallv || - coll_param.ctype == ccl_coll_alltoall || coll_param.ctype == ccl_coll_allgatherv) && + ((coll_param.ctype == ccl_coll_allreduce || coll_param.ctype == ccl_coll_reduce_scatter || + coll_param.ctype == ccl_coll_alltoallv || coll_param.ctype == ccl_coll_alltoall || + coll_param.ctype == ccl_coll_allgatherv) && multi_node && !enable_hmem) || (coll_param.ctype == ccl_coll_reduce && coll_param.comm->rank() == coll_param.root); @@ -428,7 +474,7 @@ void add_scaleout(ccl_sched* sched, if (!enable_hmem) { LOG_DEBUG("topo/scale_out: use host_", ccl_coll_type_to_str(coll_param.ctype)); - // mostly initialize contiguos send/recv buffers from array of input buffers + // mostly initialize contiguous send/recv buffers from array of input buffers out_event = fill_scaleout_coll_param(in_coll_param, coll_param, sched, wait_events); utils::clear_and_push_back(wait_events, out_event); sched->add_barrier(); @@ -440,7 +486,8 @@ void add_scaleout(ccl_sched* sched, // pass the scale-out selection param directly coll_param.is_scaleout = true; // do inplace collective - ccl::add_coll(sched, coll_param, wait_events, out_event); + + out_event = ccl::add_coll(sched, coll_param, wait_events); utils::clear_and_push_back(wait_events, out_event); } @@ -458,14 +505,15 @@ void add_scaleout(ccl_sched* sched, if (coll_param.ctype == ccl_coll_alltoallv || coll_param.ctype == ccl_coll_alltoall || coll_param.ctype == ccl_coll_allgatherv) { - out_event = add_copy_entry_with_offset(in_coll_param.recv_bufs, + out_event = add_copy_entry_with_offset(in_coll_param.recv_scale_out_bufs, coll_param.recv_buf, coll_param.recv_counts, - coll_param.comm->size(), + coll_param.comm, coll_param.dtype, h2d_copy_attr, sched, - wait_events); + wait_events, + coll_param.ctype == ccl_coll_allgatherv); } else { out_event = add_copy_entry(src_copy_buf, diff --git 
a/src/coll/coll_util.hpp b/src/coll/coll_util.hpp index 0e9bf3a24..86170b4b1 100644 --- a/src/coll/coll_util.hpp +++ b/src/coll/coll_util.hpp @@ -16,7 +16,6 @@ #pragma once #include "common/global/global.hpp" -#include "sched/entry/coll/coll_entry_param.hpp" #include "sched/entry/copy/copy_helper.hpp" #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) #include "sched/entry/ze/ze_handle_exchange_entry.hpp" @@ -24,7 +23,7 @@ namespace ccl { -void add_coll_entry(ccl_sched* sched, const ccl_coll_entry_param& param); +void add_coll_entry(ccl_sched* sched, const ccl_coll_param& param); #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) static constexpr int invalid_host_buf_size = 0; @@ -47,15 +46,15 @@ void add_handle_exchange(ccl_sched* sched, const std::vector& in_buffers, int skip_rank = ccl_comm::invalid_rank, ze_event_pool_handle_t pool = nullptr, - size_t event_idx = 0); + size_t event_idx = 0, + const ccl::utils::pt2pt_handle_exchange_info& info = {}); -void add_coll(ccl_sched* sched, - const ccl_coll_entry_param& param, - const std::vector& wait_events, - ze_event_handle_t& out_event); +ze_event_handle_t add_coll(ccl_sched* sched, + const ccl_coll_param& param, + const std::vector& wait_events); void add_scaleout(ccl_sched* sched, - const ccl_coll_entry_param& in_coll_param, + const ccl_coll_param& in_coll_param, const bool is_single_node, const std::vector& wait_events, ze_event_handle_t& out_event, diff --git a/src/coll/selection/selection.cpp b/src/coll/selection/selection.cpp index 00fe4c4a8..f02b7d9d9 100644 --- a/src/coll/selection/selection.cpp +++ b/src/coll/selection/selection.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include "atl/mpi/atl_mpi_ctx.hpp" #include "coll/selection/selection.hpp" #include "comm/comm.hpp" #include "common/global/global.hpp" @@ -30,7 +31,11 @@ std::string to_string(const ccl_selector_param& param) { << ", dt: " << ccl::global_data::get().dtypes->name(param.dtype); if (param.comm) { - ss << ", comm: { rank: " << param.comm->rank() << ", size: " << param.comm->size() << " }"; + ss << ", comm: { rank: " << param.comm->rank() << ", size: " << param.comm->size(); + if (param.ctype == ccl_coll_recv || param.ctype == ccl_coll_send) { + ss << ", peer_rank: " << param.peer_rank; + } + ss << " }"; } if (param.stream) { @@ -83,12 +88,33 @@ bool ccl_is_direct_algo(const ccl_selector_param& param) { else if (param.ctype == ccl_coll_bcast) { res = (selector->get(param) == ccl_coll_bcast_direct); } + else if (param.ctype == ccl_coll_recv) { + res = (selector->get(param) == ccl_coll_recv_direct); + } else if (param.ctype == ccl_coll_reduce) { res = (selector->get(param) == ccl_coll_reduce_direct); } else if (param.ctype == ccl_coll_reduce_scatter) { res = (selector->get(param) == ccl_coll_reduce_scatter_direct); } + else if (param.ctype == ccl_coll_send) { + res = (selector->get(param) == ccl_coll_send_direct); + } + + return res; +} + +bool ccl_is_offload_pt2pt_algo(const ccl_selector_param& param) { + bool res = false; + + auto& selector = ccl::global_data::get().algorithm_selector; + + if (param.ctype == ccl_coll_recv) { + res = (selector->get(param) == ccl_coll_recv_offload); + } + else if (param.ctype == ccl_coll_send) { + res = (selector->get(param) == ccl_coll_send_offload); + } return res; } @@ -183,6 +209,9 @@ static bool ccl_is_device_side_algo(ccl_coll_algo algo, const ccl_selector_param else if (param.ctype == ccl_coll_bcast) { return algo.bcast == 
ccl_coll_bcast_topo; } + else if (param.ctype == ccl_coll_recv) { + return algo.recv == ccl_coll_recv_topo; + } else if (param.ctype == ccl_coll_reduce) { return algo.reduce == ccl_coll_reduce_topo; } @@ -191,10 +220,13 @@ static bool ccl_is_device_side_algo(ccl_coll_algo algo, const ccl_selector_param #ifdef CCL_ENABLE_SYCL return algo.reduce_scatter == ccl_coll_reduce_scatter_topo && ccl::global_data::env().enable_ze_bidir_algo; -#else +#else // CCL_ENABLE_SYCL return algo.reduce_scatter == ccl_coll_reduce_scatter_topo; #endif // CCL_ENABLE_SYCL } + else if (param.ctype == ccl_coll_send) { + return algo.send == ccl_coll_send_topo; + } return false; } @@ -204,9 +236,9 @@ bool ccl_is_device_side_algo(const ccl_selector_param& param) { return false; #endif // CCL_ENABLE_SYCL - auto supported_colls = { ccl_coll_allgatherv, ccl_coll_allreduce, ccl_coll_alltoall, - ccl_coll_alltoallv, ccl_coll_bcast, ccl_coll_reduce, - ccl_coll_reduce_scatter }; + auto supported_colls = { ccl_coll_allgatherv, ccl_coll_allreduce, ccl_coll_alltoall, + ccl_coll_alltoallv, ccl_coll_bcast, ccl_coll_recv, + ccl_coll_reduce, ccl_coll_reduce_scatter, ccl_coll_send }; RETURN_FALSE_IF(!checkers::is_coll_supported(supported_colls, param.ctype), "coll ", ccl_coll_type_to_str(param.ctype), @@ -230,12 +262,18 @@ bool ccl_is_device_side_algo(const ccl_selector_param& param) { else if (param.ctype == ccl_coll_bcast) { algo.bcast = selector->get(param); } + else if (param.ctype == ccl_coll_recv) { + algo.recv = selector->get(param); + } else if (param.ctype == ccl_coll_reduce) { algo.reduce = selector->get(param); } else if (param.ctype == ccl_coll_reduce_scatter) { algo.reduce_scatter = selector->get(param); } + else if (param.ctype == ccl_coll_send) { + algo.send = selector->get(param); + } return ccl_is_device_side_algo(algo, param); } @@ -247,9 +285,9 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) { return false; #endif // CCL_ENABLE_SYCL - auto supported_colls = { ccl_coll_allgatherv, ccl_coll_allreduce, ccl_coll_alltoall, - ccl_coll_alltoallv, ccl_coll_bcast, ccl_coll_reduce, - ccl_coll_reduce_scatter }; + auto supported_colls = { ccl_coll_allgatherv, ccl_coll_allreduce, ccl_coll_alltoall, + ccl_coll_alltoallv, ccl_coll_bcast, ccl_coll_recv, + ccl_coll_reduce, ccl_coll_reduce_scatter, ccl_coll_send }; RETURN_FALSE_IF(!checkers::is_coll_supported(supported_colls, param.ctype), "coll is not supported"); @@ -293,9 +331,24 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) { RETURN_FALSE_IF(!param.comm->get_topo_manager().has_same_domains(), "processes are not properly distributed among domains"); - if (!ccl::global_data::env().ze_disable_oversubscription_check) { - RETURN_FALSE_IF(param.comm->get_topo_manager().has_oversubscription(), - "oversubscription case: one rank per device is only supported"); + if (comm_size > 2 && !(param.ctype == ccl_coll_recv || param.ctype == ccl_coll_send)) { + if (ccl::global_data::env().ze_enable_oversubscription_throw) { + CCL_THROW_IF_NOT( + !param.comm->get_topo_manager().has_oversubscription(), + "oversubscription case is detected: \n OneCCL expects max one rank per device, " + " but count of unique devices: ", + param.comm->get_topo_manager().get_unique_device_uuids_count(), + ", comm_size: ", + comm_size, + "\n specify comm_size to: ", + param.comm->get_topo_manager().get_unique_device_uuids_count()); + } + else { + if (ccl::global_data::env().ze_enable_oversubscription_fallback) { + RETURN_FALSE_IF(param.comm->get_topo_manager().has_oversubscription(), + 
"oversubscription case: one rank per device is only supported"); + } + } } RETURN_FALSE_IF(!ccl::global_data::env().enable_ze_bidir_algo && @@ -358,8 +411,62 @@ bool ccl_can_use_topo_algo(const ccl_selector_param& param) { RETURN_FALSE_IF(!checkers::is_single_card(param) && !checkers::is_single_node(param) && (local_proc_count % 2 != 0), "odd proc count per node is not supported"); + RETURN_FALSE_IF((param.ctype == ccl_coll_reduce) && (param.count < size_t(param.comm->size())), "reduce with count < comm_size not supported"); + + if (param.ctype == ccl_coll_recv || param.ctype == ccl_coll_send) { + auto node_comm = param.comm->get_node_comm().get(); + bool peer_rank_in_node_comm = node_comm->try_get_rank_from_global(param.peer_rank); + bool rank_in_node_comm = node_comm->try_get_rank_from_global(param.comm->rank()); + + RETURN_FALSE_IF(!(rank_in_node_comm && peer_rank_in_node_comm), + "peer_rank must be on the same node as own rank is: comm_rank: ", + param.comm->rank(), + ", peer_rank: ", + param.peer_rank, + ", rank_in_node_comm: ", + rank_in_node_comm, + ", peer_rank_in_node_comm: ", + peer_rank_in_node_comm, + ", node_comm_size: ", + node_comm->size()); + + if (ccl::global_data::env().recv_algo_raw.length() != 0 && + ccl::global_data::env().send_algo_raw.length() != 0) { + auto recv_algo = ccl_algorithm_selector_helper::algo_from_str( + ccl::global_data::env().recv_algo_raw); + auto send_algo = ccl_algorithm_selector_helper::algo_from_str( + ccl::global_data::env().send_algo_raw); + RETURN_FALSE_IF( + (recv_algo == ccl_coll_recv_direct) || (send_algo == ccl_coll_send_direct), + " pt2pt operations algo must be the same: CCL_SEND=", + ccl::global_data::env().send_algo_raw, + ", CCL_RECV=", + ccl::global_data::env().recv_algo_raw); + } + +#ifdef CCL_ENABLE_SYCL + auto rank_color = param.comm->get_topo_manager().get_intra_card_color(param.comm->rank()); + auto peer_rank_color = param.comm->get_topo_manager().get_intra_card_color(param.peer_rank); + + if (rank_color == peer_rank_color && !ccl::global_data::env().ze_pt2pt_read) { + ccl::global_data::env().ze_pt2pt_read = 1; + LOG_DEBUG("pt2pt: force read algo for within card execution case:" + " { color: ", + rank_color, + ", rank: ", + node_comm->rank(), + " }," + " { peer_color: ", + peer_rank_color, + ", peer_rank: ", + param.peer_rank, + " }"); + } +#endif // CCL_ENABLE_SYCL + } + return true; } @@ -397,3 +504,32 @@ bool ccl_can_use_datatype(ccl_coll_algo algo, const ccl_selector_param& param) { return can_use; } + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) +void set_offload_pt2pt_mpi_env() { + auto lib_attr = atl_mpi_ctx::get_lib_attr(); + if (lib_attr.type == atl_mpi_ctx::ATL_MPI_LIB_IMPI && lib_attr.hmem == 1) { + setenv("I_MPI_OFFLOAD", "2", 0); + LOG_DEBUG("IMPI case: I_MPI_OFFLOAD is set"); + } + else if (lib_attr.type == atl_mpi_ctx::ATL_MPI_LIB_MPICH && lib_attr.hmem == 1) { + setenv("MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE", "1", 0); + setenv("MPIR_CVAR_CH4_OFI_GPU_PIPELINE_MAX_NUM_BUFFERS", "8", 0); + setenv("MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK", "4", 0); + setenv("MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ", "524288", 0); + setenv("MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE", "1", 0); + setenv("MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE", "1", 0); + LOG_DEBUG("MPIR case: MPIR_CVAR_ENABLE_GPU is set in MPICH internally"); + } +} +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + +bool use_pt2pt_offload_algo() { + bool res = true; + const char* env_value = std::getenv("PSM3_GPUDIRECT"); + if 
((env_value == nullptr || std::strcmp(env_value, "0") == 0) &&
+        ccl::global_data::env().atl_transport == ccl_atl_ofi) {
+        res = false;
+    }
+    return res;
+}
diff --git a/src/coll/selection/selection.hpp b/src/coll/selection/selection.hpp
index 9f3fc34dc..6e29161ea 100644
--- a/src/coll/selection/selection.hpp
+++ b/src/coll/selection/selection.hpp
@@ -19,7 +19,17 @@
 bool ccl_is_direct_algo(const ccl_selector_param& param);
 bool ccl_is_device_side_algo(const ccl_selector_param& param);
+bool ccl_is_offload_pt2pt_algo(const ccl_selector_param& param);
 bool ccl_can_use_topo_algo(const ccl_selector_param& param);
 bool ccl_can_use_datatype(ccl_coll_algo algo, const ccl_selector_param& param);
+
+// utils
+// pt2pt: send and recv are each treated as a standalone "collective"
+// operation, which is why each has its own selector; in this case the
+// same env has to be set for both send and recv
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+void set_offload_pt2pt_mpi_env();
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
+bool use_pt2pt_offload_algo();
diff --git a/src/coll/selection/selector.hpp b/src/coll/selection/selector.hpp
index 9d2dcdb1b..094803cc2 100644
--- a/src/coll/selection/selector.hpp
+++ b/src/coll/selection/selector.hpp
@@ -51,6 +51,8 @@ struct ccl_selector_param {
     int is_sycl_buf = 0;
 #endif // CCL_ENABLE_SYCL
 
+    int peer_rank = CCL_INVALID_PEER_RANK_IDX;
+
     ccl_coll_algo hint_algo = {};
 
     bool is_scaleout = false;
@@ -66,21 +68,19 @@ using ccl_selection_table_t =
 template <typename algo_group_type>
 using ccl_selection_table_iter_t = typename ccl_selection_table_t<algo_group_type>::const_iterator;
 
-#define CCL_SELECTION_DECLARE_ALGO_SELECTOR_BASE() \
-    template <typename algo_group_type> \
-    struct ccl_algorithm_selector_base { \
-        ccl_selection_table_t<algo_group_type> main_table{}; \
-        ccl_selection_table_t<algo_group_type> fallback_table{}; \
-        ccl_selection_table_t<algo_group_type> scaleout_table{}; \
-        ccl_algorithm_selector_base(){}; \
-        void init(); \
-        void print() const; \
-        algo_group_type get(const ccl_selector_param& param) const; \
-        void insert(ccl_selection_table_t<algo_group_type>& table, \
-                    size_t left, \
-                    size_t right, \
-                    algo_group_type algo_id); \
-    };
+template <typename algo_group_type>
+struct ccl_algorithm_selector_base {
+    ccl_selection_table_t<algo_group_type> main_table;
+    ccl_selection_table_t<algo_group_type> fallback_table;
+    ccl_selection_table_t<algo_group_type> scaleout_table;
+    void init();
+    void print() const;
+    algo_group_type get(const ccl_selector_param& param) const;
+    static void insert(ccl_selection_table_t<algo_group_type>& table,
+                       size_t left,
+                       size_t right,
+                       algo_group_type algo_id);
+};
 
 #define CCL_SELECTION_DECLARE_ALGO_SELECTOR(coll_id, algo_group_type) \
     template <> \
@@ -89,15 +89,15 @@ using ccl_selection_table_iter_t = typename ccl_selection_table_t
                                    size(),
-                                   0);
+                                   ccl::utils::initial_count_value);
     count /= param.comm->size();
     count;
 }),
diff --git a/src/coll/selection/selector_allreduce.cpp b/src/coll/selection/selector_allreduce.cpp
index de435e329..059808c0b 100644
--- a/src/coll/selection/selector_allreduce.cpp
+++ b/src/coll/selection/selector_allreduce.cpp
@@ -84,9 +84,6 @@ bool ccl_algorithm_selector_helper<ccl_coll_allreduce_algo>::can_use(
         can_use = false;
     else if (algo == ccl_coll_allreduce_topo && !ccl_can_use_topo_algo(param))
         can_use = false;
-    // skip topo if count is 1
-    else if (algo == ccl_coll_allreduce_topo && param.count < 2)
-        can_use = false;
     else if (algo == ccl_coll_allreduce_2d && param.is_scaleout)
         // MLSL-1762: scale-up topo + scale-out 2d combination fails.
         // Algorithms are not compatible.
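Context for the two new selector files below: a selector owns three tables (main,
fallback, scaleout) that map message-size ranges to an algorithm id, and get() resolves a
size through them. A toy sketch of that range-table idea (the real ccl_selection_table_t
keying and fallback walk are more involved; all names here are illustrative):

    #include <cstddef>
    #include <iostream>
    #include <limits>
    #include <map>

    enum class recv_algo { direct, offload, topo };

    // map the right edge of each byte-size range to the algorithm chosen for it;
    // lookup is a lower_bound over the range edges
    using selection_table = std::map<std::size_t, recv_algo>;

    void insert(selection_table& table, std::size_t left, std::size_t right, recv_algo algo) {
        (void)left; // a production table would also split/merge overlapping ranges
        table[right] = algo;
    }

    recv_algo get(const selection_table& table, std::size_t bytes) {
        auto it = table.lower_bound(bytes);
        return it == table.end() ? recv_algo::direct : it->second;
    }

    int main() {
        constexpr std::size_t max_size = std::numeric_limits<std::size_t>::max();
        selection_table main_table;
        insert(main_table, 0, 65536, recv_algo::topo);           // small and medium messages
        insert(main_table, 65537, max_size, recv_algo::offload); // everything larger
        std::cout << static_cast<int>(get(main_table, 1024)) << " "
                  << static_cast<int>(get(main_table, 1 << 20)) << "\n"; // prints "2 1"
    }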
diff --git a/src/coll/selection/selector_recv.cpp b/src/coll/selection/selector_recv.cpp new file mode 100644 index 000000000..8cf6cd8e8 --- /dev/null +++ b/src/coll/selection/selector_recv.cpp @@ -0,0 +1,78 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "coll/selection/selection.hpp" + +template <> +std::map + ccl_algorithm_selector_helper::algo_names = { + std::make_pair(ccl_coll_recv_direct, "direct"), + std::make_pair(ccl_coll_recv_offload, "offload"), +#ifdef CCL_ENABLE_SYCL + std::make_pair(ccl_coll_recv_topo, "topo") +#endif // CCL_ENABLE_SYCL + }; + +ccl_algorithm_selector::ccl_algorithm_selector() { +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_recv_topo); +#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_recv_direct); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + if (!use_pt2pt_offload_algo()) { + insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_recv_direct); + } + else { + insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_recv_offload); + if (ccl::global_data::env().atl_transport == ccl_atl_mpi) { + set_offload_pt2pt_mpi_env(); + } + } +#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_recv_direct); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + // recv currently does not support scale-out selection, but the table + // has to be defined, therefore duplicating main table + scaleout_table = main_table; +} + +template <> +bool ccl_algorithm_selector_helper::can_use( + ccl_coll_recv_algo algo, + const ccl_selector_param& param, + const ccl_selection_table_t& table) { + bool can_use = true; + + ccl_coll_algo algo_param; + algo_param.recv = algo; + can_use = ccl_can_use_datatype(algo_param, param); + + if (algo == ccl_coll_recv_topo && !ccl_can_use_topo_algo(param)) { + can_use = false; + } + else if (algo == ccl_coll_recv_offload && !use_pt2pt_offload_algo()) { + can_use = false; + } + + return can_use; +} + +CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_recv_algo, + ccl_coll_recv, + ccl::global_data::env().recv_algo_raw, + param.count, + ccl::global_data::env().recv_scaleout_algo_raw); diff --git a/src/coll/selection/selector_send.cpp b/src/coll/selection/selector_send.cpp new file mode 100644 index 000000000..39df8768f --- /dev/null +++ b/src/coll/selection/selector_send.cpp @@ -0,0 +1,79 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "coll/selection/selection.hpp" + +template <> +std::map + ccl_algorithm_selector_helper::algo_names = { + std::make_pair(ccl_coll_send_direct, "direct"), + std::make_pair(ccl_coll_send_offload, "offload"), +#ifdef CCL_ENABLE_SYCL + std::make_pair(ccl_coll_send_topo, "topo") +#endif // CCL_ENABLE_SYCL + }; + +ccl_algorithm_selector::ccl_algorithm_selector() { +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_send_topo); +#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + insert(main_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_send_direct); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + if (!use_pt2pt_offload_algo()) { + insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_send_direct); + } + else { + insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_send_offload); + if (ccl::global_data::env().atl_transport == ccl_atl_mpi) { + set_offload_pt2pt_mpi_env(); + } + } +#else // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + insert(fallback_table, 0, CCL_SELECTION_MAX_COLL_SIZE, ccl_coll_send_direct); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + + // send currently does not support scale-out selection, but the table + // has to be defined, therefore duplicating main table + scaleout_table = main_table; +} + +template <> +bool ccl_algorithm_selector_helper::can_use( + ccl_coll_send_algo algo, + const ccl_selector_param& param, + const ccl_selection_table_t& table) { + bool can_use = true; + + ccl_coll_algo algo_param; + algo_param.send = algo; + can_use = ccl_can_use_datatype(algo_param, param); + + if (algo == ccl_coll_send_topo && !ccl_can_use_topo_algo(param)) { + can_use = false; + } + else if (algo == ccl_coll_send_offload && !use_pt2pt_offload_algo()) { + can_use = false; + } + + return can_use; +} + +CCL_SELECTION_DEFINE_HELPER_METHODS(ccl_coll_send_algo, + ccl_coll_send, + ccl::global_data::env().send_algo_raw, + param.count, + ccl::global_data::env().send_scaleout_algo_raw); diff --git a/src/comm/atl_tag.cpp b/src/comm/atl_tag.cpp index 739880d66..b8d365499 100644 --- a/src/comm/atl_tag.cpp +++ b/src/comm/atl_tag.cpp @@ -24,16 +24,39 @@ std::string ccl_atl_tag::to_string() const { return ss.str(); } -uint64_t ccl_atl_tag::create(int rank, - ccl_comm_id_t comm_id, - ccl_sched_id_t sched_id, - ccl_op_id_t op_id) { +template +ccl_sched_id_t ccl_atl_tag_impl::get_max_sched_count() { + return Layout::max_sched_count; +} + +template +ccl_sched_id_t ccl_atl_tag_impl::get_pt2pt_sched_id() { + return Layout::pt2pt_sched_id; +} + +template +ccl_sched_id_t ccl_atl_tag_impl::get_pt2pt_ack_tag() { + return Layout::pt2pt_ack_tag; +} + +template +std::tuple ccl_atl_tag_impl::get_pt2pt_sync_tags() { + auto ack_first = Layout::pt2pt_ack_first; + auto ack_second = Layout::pt2pt_ack_second; + return std::make_tuple(ack_first, ack_second); +} + +template +uint64_t ccl_atl_tag_impl::create(int rank, + ccl_comm_id_t comm_id, + ccl_sched_id_t sched_id, + ccl_op_id_t op_id) { uint64_t tag = 0; - tag |= (((uint64_t)op_id) << op_id_shift) & op_id_mask; - tag |= (((uint64_t)sched_id) << sched_id_shift) & sched_id_mask; - tag |= (((uint64_t)comm_id) << comm_id_shift) & comm_id_mask; - tag |= (((uint64_t)rank) << rank_shift) & rank_mask; + tag |= (static_cast(op_id) << Layout::op_id_shift) & Layout::op_id_mask; + tag |= (static_cast(sched_id) << Layout::sched_id_shift) 
& Layout::sched_id_mask; + tag |= (static_cast(comm_id) << Layout::comm_id_shift) & Layout::comm_id_mask; + tag |= (static_cast(rank) << Layout::rank_shift) & Layout::rank_mask; if (tag > max_tag) tag &= max_tag_mask; @@ -67,3 +90,8 @@ uint64_t ccl_atl_tag::create(int rank, return tag; } + +// Explicit template instantiation to avoid linker errors +template class ccl_atl_tag_impl; +template class ccl_atl_tag_impl; +template class ccl_atl_tag_impl; diff --git a/src/comm/atl_tag.hpp b/src/comm/atl_tag.hpp index 4f6e56613..7cb5183e1 100644 --- a/src/comm/atl_tag.hpp +++ b/src/comm/atl_tag.hpp @@ -18,13 +18,115 @@ #include "common/log/log.hpp" #include "common/utils/utils.hpp" +#include + using ccl_op_id_t = uint8_t; using ccl_sched_id_t = uint16_t; using ccl_comm_id_t = uint16_t; +// A list of observed distinct provider specific number of bits used for the tag +enum tag_layout : unsigned int { mpich = 17, impi = 24, cxi = 48, common = 64 }; + +// Point-to-point tag configuration. +// Adjusted to the maximum ccl_sched_id_t value +// that can be encoded into the tag +template +struct pt2pt_tag_layout { + static_assert(N > 0 && N <= sizeof(ccl_sched_id_t) * 8, + "the number of bits should not exceed the size of ccl_sched_id_t type"); + // We have declared the tag using the data type ccl_sched_id_t, + // however, it can be equivalent to a N-bit unsigned integer. For pt2pt, + // we use the maximum value that can be represented with these N bits as the tag. + static constexpr ccl_sched_id_t pt2pt_sched_id = (1 << N) - 1; + static_assert(pt2pt_sched_id <= std::numeric_limits::max(), + "pt2pt_sched_id should not exceed the max value of ccl_sched_id_t"); + // these tags are reserved for pt2pt ack messages to align topo pt2pt operations + static constexpr ccl_sched_id_t pt2pt_ack_tag = pt2pt_sched_id - 1; + static constexpr ccl_sched_id_t pt2pt_ack_first = pt2pt_sched_id - 2; + static constexpr ccl_sched_id_t pt2pt_ack_second = pt2pt_sched_id - 3; + // maximum value of schedule id in scope of the current communicator + static constexpr ccl_sched_id_t max_sched_count = pt2pt_sched_id - 4; +}; + +/* + * Common 64-bit layout suitable for the most cases + */ +/********************************************************************************** + * common tag layout * + * ******************************************************************************** + * 01234567 01234567 01234567 | 01234567 01234567 | 01234567 01234567 | 01234567 | + * | | | | + * rank | comm_id | sched_id | op_id | + *********************************************************************************/ +struct common_tag_layout : pt2pt_tag_layout<16> { + static constexpr int op_id_shift = 0; + static constexpr int sched_id_shift = 8; + static constexpr int comm_id_shift = 24; + static constexpr int rank_shift = 40; + + static constexpr uint64_t op_id_mask = 0x00000000000000FF; + static constexpr uint64_t sched_id_mask = 0x0000000000FFFF00; + static constexpr uint64_t comm_id_mask = 0x000000FFFF000000; + static constexpr uint64_t rank_mask = 0xFFFFFF0000000000; +}; + +/* + * CXI provider accepts 48-bits tag. + * With the common layout it means, that we have to cut the most + * significant bits in our tag for correctness. However, rank identifier + * could then hold only 8-bits, therefore 256 ranks could be encoded in the tag. + * If there is more then 256 ranks communicating with each other, tags + * may coalesce, leading to the wrong messages accepted in a place, + * which will be hard to track. 
+ * Space for 2^20 ranks should be sufficient; the other bit fields
+ * were cut to reasonable values.
+ */
+/******************************************************************
+ *                        cxi tag layout                          *
+ * ****************************************************************
+ * 0123 01234567 01234567 | 01234567 0123 | 01234567 0123 | 0123 |
+ *                        |               |               |      |
+ *          rank          |    comm_id    |    sched_id   | op_id|
+ ******************************************************************/
+struct ofi_cxi_tag_layout : pt2pt_tag_layout<12> {
+    static constexpr int op_id_shift = 0;
+    static constexpr int sched_id_shift = 4;
+    static constexpr int comm_id_shift = 16;
+    static constexpr int rank_shift = 28;
+
+    static constexpr uint64_t op_id_mask = 0x000000000000000F;
+    static constexpr uint64_t sched_id_mask = 0x000000000000FFF0;
+    static constexpr uint64_t comm_id_mask = 0x000000000FFF0000;
+    static constexpr uint64_t rank_mask = 0x0000FFFFF0000000;
+};
+
+/*
+ * The MPI standard requires the tag to be at most a 32-bit integer number
+ * and no less than a 16-bit value.
+ * TODO: support MPICH and I_MPI layouts separately.
+ */
+/************************************************
+ *                mpi tag layout                *
+ * **********************************************
+ * 01234567 | 01234567 | 01234567 0123 | 0123 |
+ *          |          |               |      |
+ *   rank   |  comm_id |    sched_id   | op_id|
+ ************************************************/
+struct mpi_tag_layout : pt2pt_tag_layout<12> {
+    static constexpr int op_id_shift = 0;
+    static constexpr int sched_id_shift = 4;
+    static constexpr int comm_id_shift = 16;
+    static constexpr int rank_shift = 24;
+
+    static constexpr uint64_t op_id_mask = 0x000000000000000F;
+    static constexpr uint64_t sched_id_mask = 0x000000000000FFF0;
+    static constexpr uint64_t comm_id_mask = 0x0000000000FF0000;
+    static constexpr uint64_t rank_mask = 0x00000000FF000000;
+};
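Note: as a quick sanity check of the layout constants defined above, this standalone sketch packs and unpacks a tag using the common 64-bit layout values (constants copied from common_tag_layout; the round-trip asserts are illustrative only, not library code):

#include <cassert>
#include <cstdint>

int main() {
    // Shifts and masks of the common 64-bit layout.
    constexpr int op_id_shift = 0, sched_id_shift = 8, comm_id_shift = 24, rank_shift = 40;
    constexpr uint64_t op_id_mask = 0x00000000000000FF;
    constexpr uint64_t sched_id_mask = 0x0000000000FFFF00;
    constexpr uint64_t comm_id_mask = 0x000000FFFF000000;
    constexpr uint64_t rank_mask = 0xFFFFFF0000000000;

    const uint64_t rank = 3, comm_id = 42, sched_id = 7, op_id = 1;

    // Pack, exactly like create() above.
    uint64_t tag = 0;
    tag |= (op_id << op_id_shift) & op_id_mask;
    tag |= (sched_id << sched_id_shift) & sched_id_mask;
    tag |= (comm_id << comm_id_shift) & comm_id_mask;
    tag |= (rank << rank_shift) & rank_mask;

    // Unpacking inverts the packing, so every field survives the round trip.
    assert(((tag & rank_mask) >> rank_shift) == rank);
    assert(((tag & comm_id_mask) >> comm_id_shift) == comm_id);
    assert(((tag & sched_id_mask) >> sched_id_shift) == sched_id);
    assert(((tag & op_id_mask) >> op_id_shift) == op_id);
    return 0;
}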
 class ccl_atl_tag {
 public:
-    ccl_atl_tag(size_t tag_bits, size_t max_tag) : tag_bits(tag_bits), max_tag(max_tag) {
+    ccl_atl_tag(size_t tag_bits, size_t max_tag) : tag_bits{ tag_bits }, max_tag{ max_tag } {
         CCL_THROW_IF_NOT(tag_bits >= 32, "unexpected tag_bits ", tag_bits);
         CCL_ASSERT(sizeof(ccl_op_id_t) == 1);
         CCL_ASSERT(sizeof(ccl_sched_id_t) <= 2);
@@ -42,7 +144,7 @@ class ccl_atl_tag {
     ccl_atl_tag& operator=(const ccl_atl_tag& other) = delete;
     ccl_atl_tag& operator=(ccl_atl_tag&& other) = delete;
 
-    ~ccl_atl_tag() = default;
+    virtual ~ccl_atl_tag() = default;
 
     std::string to_string() const;
 
@@ -54,31 +156,35 @@ class ccl_atl_tag {
      * @param op_id local operation id, used as sub-schedule identifier
      * @return ATL communication tag
      */
-    uint64_t create(int rank,
-                    ccl_comm_id_t comm_id,
-                    ccl_sched_id_t sched_id,
-                    ccl_op_id_t op_id = 0);
+    virtual uint64_t create(int rank,
+                            ccl_comm_id_t comm_id,
+                            ccl_sched_id_t sched_id,
+                            ccl_op_id_t op_id = 0) = 0;
 
-private:
-    /**********************************************************************************
-     *                                  atl tag layout                                *
-     * ********************************************************************************
-     * 01234567 01234567 01234567 | 01234567 01234567 | 01234567 01234567 | 01234567 |
-     *                            |                   |                   |          |
-     *           rank             |      comm_id      |      sched_id     |   op_id  |
-     *********************************************************************************/
+    // Point-to-point config data accessors
+    virtual ccl_sched_id_t get_max_sched_count() = 0;
+    virtual ccl_sched_id_t get_pt2pt_sched_id() = 0;
+    virtual ccl_sched_id_t get_pt2pt_ack_tag() = 0;
+    virtual std::tuple<ccl_sched_id_t, ccl_sched_id_t> get_pt2pt_sync_tags() = 0;
+
+protected:
     size_t tag_bits;
     size_t max_tag;
     size_t max_tag_mask;
+};
 
-    const int op_id_shift = 0;
-    const int sched_id_shift = 8;
-    const int comm_id_shift = 24;
-    const int rank_shift = 40;
+template <class Layout>
+class ccl_atl_tag_impl : public ccl_atl_tag {
+public:
+    ccl_atl_tag_impl(size_t tag_bits, size_t max_tag) : ccl_atl_tag{ tag_bits, max_tag } {}
+
+    uint64_t create(int rank,
+                    ccl_comm_id_t comm_id,
+                    ccl_sched_id_t sched_id,
+                    ccl_op_id_t op_id = 0);
 
-    const uint64_t op_id_mask = 0x00000000000000FF;
-    const uint64_t sched_id_mask = 0x0000000000FFFF00;
-    const uint64_t comm_id_mask = 0x000000FFFF000000;
-    const uint64_t rank_mask = 0xFFFFFF0000000000;
+    ccl_sched_id_t get_max_sched_count();
+    ccl_sched_id_t get_pt2pt_sched_id();
+    ccl_sched_id_t get_pt2pt_ack_tag();
+    std::tuple<ccl_sched_id_t, ccl_sched_id_t> get_pt2pt_sync_tags();
 };
diff --git a/src/comm/comm.cpp b/src/comm/comm.cpp
index 0a45e8d2b..d1f737fdb 100644
--- a/src/comm/comm.cpp
+++ b/src/comm/comm.cpp
@@ -114,7 +114,7 @@ void ccl_comm::init(int comm_id,
     comm_rank = atl_comm->get_rank();
     comm_size = atl_comm->get_size();
 
-    next_sched_id_internal = ccl_comm::max_sched_count / 2;
+    next_sched_id_internal = atl_comm->tag_creator->get_max_sched_count() / 2;
     next_sched_id_external = 0;
 
     if (comm_rank >= comm_size || comm_size <= 0) {
@@ -152,7 +152,7 @@ ccl_comm::ccl_comm(int comm_id,
                    std::shared_ptr<atl_base_comm> atl_comm,
                    bool share_resources,
                    bool is_sub_communicator) {
-    init(comm_id, atl_comm, share_resources, is_sub_communicator);
+    init(comm_id, std::move(atl_comm), share_resources, is_sub_communicator);
 }
 
 ccl_comm::ccl_comm(std::shared_ptr<atl_base_comm> atl_comm,
@@ -163,14 +163,15 @@ ccl_comm::ccl_comm(device_t device, context_t context, std::shared_ptr<atl_base_comm> atl_comm)
         : device_ptr(std::make_shared<ccl::device>(device)),
           context_ptr(std::make_shared<ccl::context>(context)) {
-    init(atl_comm->create_comm_id(), atl_comm);
+    int id = atl_comm->create_comm_id();
+    init(id, std::move(atl_comm));
 }
 
 ccl_comm::ccl_comm(int size, int rank, ccl::shared_ptr_class<ccl::kvs_interface> kvs)
-        : ccl_comm(atl_comm_manager::create(size, { rank }, kvs)) {}
+        : ccl_comm(atl_comm_manager::create(size, { rank }, std::move(kvs))) {}
 
 ccl_comm::ccl_comm(int size, ccl::shared_ptr_class<ccl::kvs_interface> kvs)
-        : ccl_comm(atl_comm_manager::create(size, { 0 }, kvs)) {}
+        : ccl_comm(atl_comm_manager::create(size, { 0 }, std::move(kvs))) {}
 
 ccl_comm::ccl_comm() : ccl_comm(atl_comm_manager::create()) {}
 
@@ -316,19 +317,43 @@ int ccl_comm::get_rank_from_global(int global_rank) const {
     return rank;
 }
 
-ccl_sched_id_t ccl_comm::get_sched_id(bool use_internal_space) {
+bool ccl_comm::try_get_rank_from_global(int global_rank) const {
+    bool ret = false;
+    if (local2global_map.empty()) {
+        // global comm and its copies do not have entries in the map
+        return ret;
+    }
+
+    for (size_t i = 0; i < local2global_map.size(); ++i) {
+        if (local2global_map[i] == global_rank) {
+            return true;
+        }
+    }
+
+    return ret;
+}
+
+ccl_sched_id_t ccl_comm::get_sched_id(bool use_internal_space, bool is_pt2pt) {
+    std::shared_ptr<atl_base_comm> atl_comm = get_atl_comm();
     ccl_sched_id_t& next_sched_id =
         (use_internal_space) ? next_sched_id_internal : next_sched_id_external;
 
+    ccl_sched_id_t max_sched_count = atl_comm->tag_creator->get_max_sched_count();
+
     ccl_sched_id_t first_sched_id =
-        (use_internal_space) ? static_cast<ccl_sched_id_t>(0) : ccl_comm::max_sched_count / 2;
+        (use_internal_space) ? static_cast<ccl_sched_id_t>(0) : max_sched_count / 2;
 
-    ccl_sched_id_t max_sched_id =
-        (use_internal_space) ? ccl_comm::max_sched_count / 2 : ccl_comm::max_sched_count;
+    ccl_sched_id_t max_sched_id = (use_internal_space) ? max_sched_count / 2 : max_sched_count;
 
     ccl_sched_id_t id = next_sched_id;
 
-    ++next_sched_id;
+    // the is_pt2pt flag keeps send-recv sched ids matched between ranks:
+    // when is_pt2pt = true, the ++next_sched_id op is skipped
+    if (!is_pt2pt) {
+        ++next_sched_id;
+    }
 
     if (next_sched_id == max_sched_id) {
         /* wrap the sched numbers around to the start */
diff --git a/src/comm/comm.hpp b/src/comm/comm.hpp
index 9e0fd7b51..a4ba1b512 100644
--- a/src/comm/comm.hpp
+++ b/src/comm/comm.hpp
@@ -142,16 +142,6 @@ class alignas(CACHELINE_SIZE) ccl_comm : public ccl::comm_interface {
 public:
     static constexpr int invalid_rank = -1;
 
-    // maximum value of schedule id in scope of the current communicator
-    static constexpr ccl_sched_id_t max_sched_count =
-        std::numeric_limits<ccl_sched_id_t>::max() - 1;
-    // We have declared the tag using the data type ccl_sched_id_t,
-    // which is equivalent to a 16-bit unsigned integer. For pt2pt,
-    // we use the maximum value that can be represented with these 16
-    // bits as the tag. This approach works effectively because the
-    // schedule_id in the tag also uses 16 bits
-    static constexpr ccl_sched_id_t pt2pt_sched_id = std::numeric_limits<ccl_sched_id_t>::max();
-
     void init(int comm_id,
               std::shared_ptr<atl_base_comm> atl_comm,
               bool share_resources = false,
@@ -223,7 +213,8 @@ class alignas(CACHELINE_SIZE) ccl_comm : public ccl::comm_interface {
     int get_global_rank(int rank) const;
     int get_rank_from_global(int global_rank) const;
 
-    ccl_sched_id_t get_sched_id(bool use_internal_space);
+    bool try_get_rank_from_global(int global_rank) const;
+    ccl_sched_id_t get_sched_id(bool use_internal_space, bool is_pt2pt);
 
     device_ptr_t get_device() const override {
         return device_ptr;
diff --git a/src/comm/comm_selector.cpp b/src/comm/comm_selector.cpp
index ed8f5f947..bdf4b6a9b 100644
--- a/src/comm/comm_selector.cpp
+++ b/src/comm/comm_selector.cpp
@@ -58,7 +58,7 @@ comm_interface_ptr comm_selector::create_comm_impl(const size_t size,
     CCL_THROW_IF_NOT(ccl::global_data::env().backend == backend_mode::native,
                      "host communicator is only supported for native backend");
 
-    return comm_interface_ptr(ccl_comm::create(size, rank, kvs));
+    return comm_interface_ptr(ccl_comm::create(size, rank, std::move(kvs)));
 }
 
 comm_interface_ptr comm_selector::create_comm_impl(const size_t size,
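Note: the split id space used by get_sched_id() above can be illustrated with a small standalone sketch (simplified; the real code also distinguishes pt2pt and reads max_sched_count from the tag layout):

#include <cstdint>
#include <iostream>

using sched_id_t = uint16_t;

// Illustrative only: the internal space occupies [0, max/2), the external
// space occupies [max/2, max), and each wraps around independently,
// mirroring ccl_comm::get_sched_id() above.
sched_id_t next_id(sched_id_t& next, bool internal, sched_id_t max_sched_count) {
    const sched_id_t first = internal ? 0 : max_sched_count / 2;
    const sched_id_t last = internal ? max_sched_count / 2 : max_sched_count;
    sched_id_t id = next;
    if (++next == last) {
        next = first; // wrap the sched numbers around to the start
    }
    return id;
}

int main() {
    sched_id_t next_internal = 0;
    for (int i = 0; i < 5; ++i) {
        std::cout << next_id(next_internal, true, 8) << ' '; // prints: 0 1 2 3 0
    }
    std::cout << '\n';
}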
diff --git a/src/common/api_wrapper/ofi_api_wrapper.cpp b/src/common/api_wrapper/ofi_api_wrapper.cpp
index 3cd6b7f2c..4b2890ac5 100644
--- a/src/common/api_wrapper/ofi_api_wrapper.cpp
+++ b/src/common/api_wrapper/ofi_api_wrapper.cpp
@@ -13,6 +13,8 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#include <sys/stat.h>
+
 #include "common/api_wrapper/api_wrapper.hpp"
 #include "common/api_wrapper/ofi_api_wrapper.hpp"
 
@@ -21,21 +23,53 @@ namespace ccl {
 lib_info_t ofi_lib_info;
 ofi_lib_ops_t ofi_lib_ops;
 
+std::string get_ofi_lib_path() {
+    // lib_path specifies the name and full path to the OFI library -
+    // it should be an absolute and validated path pointing to the
+    // desired libfabric library
+
+    // the order of searching for libfabric is:
+    // * CCL_OFI_LIBRARY_PATH (ofi_lib_path env)
+    // * I_MPI_OFI_LIBRARY
+    // * I_MPI_ROOT/opt/mpi/libfabric/lib
+    // * LD_LIBRARY_PATH
+
+    auto ofi_lib_path = ccl::global_data::env().ofi_lib_path;
+    if (!ofi_lib_path.empty()) {
+        LOG_DEBUG("OFI lib path (CCL_OFI_LIBRARY_PATH): ", ofi_lib_path);
+    }
+    else {
+        char* mpi_ofi_path = getenv("I_MPI_OFI_LIBRARY");
+        if (mpi_ofi_path) {
+            ofi_lib_path = std::string(mpi_ofi_path);
+            LOG_DEBUG("OFI lib path (I_MPI_OFI_LIBRARY): ", ofi_lib_path);
+        }
+        else {
+            char* mpi_root = getenv("I_MPI_ROOT");
+            std::string mpi_root_ofi_lib_path =
+                mpi_root == NULL ? std::string() : std::string(mpi_root);
+            mpi_root_ofi_lib_path += "/opt/mpi/libfabric/lib/libfabric.so";
+            struct stat buffer {};
+            if (mpi_root && stat(mpi_root_ofi_lib_path.c_str(), &buffer) == 0) {
+                ofi_lib_path = std::move(mpi_root_ofi_lib_path);
+                LOG_DEBUG("OFI lib path (MPI_ROOT/opt/mpi/libfabric/lib/): ", ofi_lib_path);
+            }
+            else {
+                ofi_lib_path = "libfabric.so";
+                LOG_DEBUG("OFI lib path (LD_LIBRARY_PATH): ", ofi_lib_path);
+            }
+        }
+    }
+
+    return ofi_lib_path;
+}
+
 bool ofi_api_init() {
     bool ret = true;
 
     ofi_lib_info.ops = &ofi_lib_ops;
     ofi_lib_info.fn_names = ofi_fn_names;
-
-    // lib_path specifies the name and full path to the OFI library -
-    // it should be absolute and validated path
-    // pointing to desired libfabric library
-    ofi_lib_info.path = ccl::global_data::env().ofi_lib_path;
-
-    if (ofi_lib_info.path.empty()) {
-        ofi_lib_info.path = "libfabric.so.1";
-    }
-    LOG_DEBUG("OFI lib path: ", ofi_lib_info.path);
+    ofi_lib_info.path = get_ofi_lib_path();
 
     load_library(ofi_lib_info);
     if (!ofi_lib_info.handle)
diff --git a/src/common/api_wrapper/ofi_api_wrapper.hpp b/src/common/api_wrapper/ofi_api_wrapper.hpp
index 9a4da26ce..fa04e298a 100644
--- a/src/common/api_wrapper/ofi_api_wrapper.hpp
+++ b/src/common/api_wrapper/ofi_api_wrapper.hpp
@@ -50,5 +50,6 @@ extern ccl::ofi_lib_ops_t ofi_lib_ops;
 
 bool ofi_api_init();
 void ofi_api_fini();
+std::string get_ofi_lib_path();
 
 } //namespace ccl
diff --git a/src/common/api_wrapper/ze_api_wrapper.hpp b/src/common/api_wrapper/ze_api_wrapper.hpp
index 3e0759f96..ccec6bada 100644
--- a/src/common/api_wrapper/ze_api_wrapper.hpp
+++ b/src/common/api_wrapper/ze_api_wrapper.hpp
@@ -59,10 +59,12 @@ typedef struct ze_lib_ops {
     decltype(zeCommandListAppendWaitOnEvents) *zeCommandListAppendWaitOnEvents;
     decltype(zeCommandListAppendSignalEvent) *zeCommandListAppendSignalEvent;
     decltype(zeCommandListAppendBarrier) *zeCommandListAppendBarrier;
+    decltype(zeCommandListAppendMemoryRangesBarrier) *zeCommandListAppendMemoryRangesBarrier;
     decltype(zeCommandListClose) *zeCommandListClose;
     decltype(zeCommandListReset) *zeCommandListReset;
     decltype(zeCommandListDestroy) *zeCommandListDestroy;
     decltype(zeContextCreate) *zeContextCreate;
+    decltype(zeContextSystemBarrier) *zeContextSystemBarrier;
     decltype(zeContextDestroy) *zeContextDestroy;
     decltype(zeEventPoolCreate) *zeEventPoolCreate;
     decltype(zeEventCreate) *zeEventCreate;
@@ -133,10 +135,12 @@ static std::vector<std::string> ze_fn_names = {
     "zeCommandListAppendWaitOnEvents",
"zeCommandListAppendSignalEvent", "zeCommandListAppendBarrier", + "zeCommandListAppendMemoryRangesBarrier", "zeCommandListClose", "zeCommandListReset", "zeCommandListDestroy", "zeContextCreate", + "zeContextSystemBarrier", "zeContextDestroy", "zeEventPoolCreate", "zeEventCreate", @@ -209,10 +213,13 @@ extern ccl::ze_lib_ops_t ze_lib_ops; #define zeCommandListAppendWaitOnEvents ccl::ze_lib_ops.zeCommandListAppendWaitOnEvents #define zeCommandListAppendSignalEvent ccl::ze_lib_ops.zeCommandListAppendSignalEvent #define zeCommandListAppendBarrier ccl::ze_lib_ops.zeCommandListAppendBarrier +#define zeCommandListAppendMemoryRangesBarrier \ + ccl::ze_lib_ops.zeCommandListAppendMemoryRangesBarrier #define zeCommandListClose ccl::ze_lib_ops.zeCommandListClose #define zeCommandListReset ccl::ze_lib_ops.zeCommandListReset #define zeCommandListDestroy ccl::ze_lib_ops.zeCommandListDestroy #define zeContextCreate ccl::ze_lib_ops.zeContextCreate +#define zeContextSystemBarrier ccl::ze_lib_ops.zeContextSystemBarrier #define zeContextDestroy ccl::ze_lib_ops.zeContextDestroy #define zeEventPoolCreate ccl::ze_lib_ops.zeEventPoolCreate #define zeEventCreate ccl::ze_lib_ops.zeEventCreate diff --git a/src/common/context/context.cpp b/src/common/context/context.cpp index 85a4b4f82..2fce5e2b0 100644 --- a/src/common/context/context.cpp +++ b/src/common/context/context.cpp @@ -38,7 +38,7 @@ void ccl_context_impl::build_from_params() { throw ccl::exception("error"); } #ifdef CCL_ENABLE_SYCL - /* TODO unavailbale?? + /* TODO unavailable?? event_native_t event_candidate{native_context}; std::swap(event_candidate, native_event); //TODO USE attributes fro sycl queue construction */ diff --git a/src/common/datatype/datatype.cpp b/src/common/datatype/datatype.cpp index d9d8484b1..8433276aa 100644 --- a/src/common/datatype/datatype.cpp +++ b/src/common/datatype/datatype.cpp @@ -182,23 +182,41 @@ void ccl_datatype_storage::free(ccl::datatype idx) { const ccl_datatype& ccl_datatype_storage::get(ccl::datatype idx) const { if (is_predefined_datatype(idx)) { - return predefined_table.find(idx)->second.first; + auto it = predefined_table.find(idx); + if (it != predefined_table.end()) { + return it->second.first; + } + LOG_WARN("unexpected idx for ccl_datatype_storage::get(), returning default datatype"); + return default_datatype; } - else { - std::lock_guard lock{ guard }; - return custom_table.find(idx)->second.first; + std::lock_guard lock{ guard }; + auto it = custom_table.find(idx); + if (it != custom_table.end()) { + return it->second.first; } + LOG_WARN("unexpected idx for ccl_datatype_storage::get(), returning default datatype"); + return default_datatype; } const std::string& ccl_datatype_storage::name(const ccl_datatype& dtype) const { ccl::datatype idx = dtype.idx(); if (is_predefined_datatype(idx)) { - return predefined_table.find(idx)->second.second; + auto it = predefined_table.find(idx); + if (it != predefined_table.end()) { + return it->second.second; + } + LOG_WARN( + "unexpected datatype for ccl_datatype_storage::name(), returning \"undefined\" type name"); + return default_type_str; } - else { - std::lock_guard lock{ guard }; - return custom_table.find(idx)->second.second; + std::lock_guard lock{ guard }; + auto it = custom_table.find(idx); + if (it != custom_table.end()) { + return it->second.second; } + LOG_WARN( + "unexpected datatype for ccl_datatype_storage::name(), returning \"undefined\" type name"); + return default_type_str; } const std::string& ccl_datatype_storage::name(ccl::datatype idx) const { 
diff --git a/src/common/datatype/datatype.hpp b/src/common/datatype/datatype.hpp
index 32610e490..929b265f6 100644
--- a/src/common/datatype/datatype.hpp
+++ b/src/common/datatype/datatype.hpp
@@ -107,6 +107,9 @@ class ccl_datatype_storage {
 
     ccl_datatype_table_t predefined_table;
     ccl_datatype_table_t custom_table;
+
+    ccl_datatype default_datatype{};
+    std::string default_type_str = "undefined";
 };
 
 namespace ccl {
diff --git a/src/common/device/device.cpp b/src/common/device/device.cpp
index 670466f7e..66e83655e 100644
--- a/src/common/device/device.cpp
+++ b/src/common/device/device.cpp
@@ -38,7 +38,7 @@ void ccl_device_impl::build_from_params() {
         throw ccl::exception("error");
     }
 #ifdef CCL_ENABLE_SYCL
-    /* TODO unavailbale??
+    /* TODO unavailable??
    event_native_t event_candidate{native_context};
    std::swap(event_candidate, native_event); //TODO USE attributes fro sycl queue construction
     */
diff --git a/src/common/env/env.cpp b/src/common/env/env.cpp
index d58aaecb1..dee07cea2 100644
--- a/src/common/env/env.cpp
+++ b/src/common/env/env.cpp
@@ -149,6 +149,13 @@ env_data::env_data()
               0), // pipelined monolithic kernel for xelink + mdfi transfer
           alltoallv_monolithic_kernel(1),
           alltoallv_monolithic_read_kernel(1),
+
+          // TODO: these values are not used yet; add log_info and set them
+          // once they are needed
+          allgatherv_pipe_chunk_count(0),
+          allreduce_pipe_chunk_count(0),
+          reduce_scatter_pipe_chunk_count(0),
+          reduce_pipe_chunk_count(0),
 #endif // CCL_ENABLE_SYCL
 
           allreduce_nreduce_buffering(0),
@@ -200,9 +207,15 @@ env_data::env_data()
           enable_sycl_output_event(1),
           use_hmem(1),
 
+          sync_barrier(1),
+
           enable_ze_barrier(0),
           enable_ze_bidir_algo(1),
           enable_ze_cache(1),
+          ze_device_cache_evict_smallest(1),
+          ze_device_cache_upper_limit(800 * 1024L * 1024L),
+          ze_device_cache_num_blocks_in_chunk(1),
+          ze_device_cache_policy(ccl::ze::device_cache_policy_mode::chunk),
           // Note: env. vars are required when
           // functionality is completed to support bypass/cache
@@ -216,7 +229,8 @@ env_data::env_data()
           enable_ze_single_list(1),
           disable_ze_family_check(0),
           disable_ze_port_check(0),
-          ze_disable_oversubscription_check(0),
+          ze_enable_oversubscription_fallback(1),
+          ze_enable_oversubscription_throw(1),
           ze_serialize_mode(0),
           ze_copy_engine(ccl::ze::copy_engine_mode::link),
           ze_h2d_copy_engine(ccl::ze::h2d_copy_engine_mode::none),
@@ -226,7 +240,6 @@ env_data::env_data()
           enable_ze_list_dump(0),
           ze_queue_index_offset(0),
           ze_close_ipc_wa(0),
-          enable_ze_cmd_bypass(1),
           ze_lib_path(),
           ze_enable(1),
           ze_fini_wa(0),
@@ -242,6 +255,8 @@ env_data::env_data()
 #else // ZE_PCI_PROPERTIES_EXT_NAME
           ze_drm_bdf_support(0),
 #endif // ZE_PCI_PROPERTIES_EXT_NAME
+          ze_pt2pt_read(1),
+          type2_mode(type2_tune_mode::undetected),
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_PMIX
@@ -322,14 +337,18 @@ void env_data::parse() {
     env_2_enum(CCL_MNIC_OFFSET, mnic_offset_names, mnic_offset);
 
     env_2_type(CCL_ALGO_FALLBACK, enable_algo_fallback);
+    // main algorithm selection
     env_2_type(CCL_ALLGATHERV, allgatherv_algo_raw);
     env_2_type(CCL_ALLREDUCE, allreduce_algo_raw);
     env_2_type(CCL_ALLTOALL, alltoall_algo_raw);
    env_2_type(CCL_ALLTOALLV, alltoallv_algo_raw);
     env_2_type(CCL_BARRIER, barrier_algo_raw);
     env_2_type(CCL_BCAST, bcast_algo_raw);
+    env_2_type(CCL_RECV, recv_algo_raw);
     env_2_type(CCL_REDUCE, reduce_algo_raw);
     env_2_type(CCL_REDUCE_SCATTER, reduce_scatter_algo_raw);
+    env_2_type(CCL_SEND, send_algo_raw);
+    // scale-out selection part
     env_2_type(CCL_ALLGATHERV_SCALEOUT, allgatherv_scaleout_algo_raw);
     env_2_type(CCL_ALLREDUCE_SCALEOUT, allreduce_scaleout_algo_raw);
     env_2_type(CCL_ALLTOALL_SCALEOUT, alltoall_scaleout_algo_raw);
@@ -405,6 +424,11 @@ void env_data::parse() {
     env_2_type(CCL_ALLGATHERV_MONOLITHIC_PIPELINE_KERNEL, allgatherv_monolithic_pipeline_kernel);
     env_2_type(CCL_ALLTOALLV_MONOLITHIC_KERNEL, alltoallv_monolithic_kernel);
     env_2_type(CCL_ALLTOALLV_MONOLITHIC_READ_KERNEL, alltoallv_monolithic_read_kernel);
+
+    env_2_type(CCL_ALLGATHERV_PIPE_CHUNK_COUNT, allgatherv_pipe_chunk_count);
+    env_2_type(CCL_ALLREDUCE_PIPE_CHUNK_COUNT, allreduce_pipe_chunk_count);
+    env_2_type(CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT, reduce_scatter_pipe_chunk_count);
+    env_2_type(CCL_REDUCE_PIPE_CHUNK_COUNT, reduce_pipe_chunk_count);
 #endif // CCL_ENABLE_SYCL
 
     env_2_type(CCL_ALLREDUCE_NREDUCE_BUFFERING, allreduce_nreduce_buffering);
@@ -447,11 +471,16 @@ void env_data::parse() {
     if (kernel_path.empty()) {
         std::string ccl_root;
         char* ccl_root_env_value = getenv("CCL_ROOT");
+        char* oneapi_root_env_value = getenv("ONEAPI_ROOT");
         if (ccl_root_env_value) {
             ccl_root = ccl_root_env_value;
         }
-        CCL_THROW_IF_NOT(!ccl_root.empty(), "incorrect comm kernels path, CCL_ROOT not found!");
-        kernel_path = ccl_root + "/lib/kernels/";
+        else if (oneapi_root_env_value) {
+            ccl_root = oneapi_root_env_value;
+        }
+        CCL_THROW_IF_NOT(!ccl_root.empty(),
+                         "incorrect comm kernels path, neither CCL_ROOT nor ONEAPI_ROOT found!");
+        kernel_path = ccl_root + "/lib/ccl/kernels/";
     }
 
     env_2_type(CCL_KERNEL_DEBUG, kernel_debug);
@@ -468,9 +497,15 @@ void env_data::parse() {
     env_2_type(CCL_SYCL_OUTPUT_EVENT, enable_sycl_output_event);
     env_2_type(CCL_USE_HMEM, use_hmem);
 
+    env_2_type(CCL_BARRIER_SYNC, sync_barrier);
+
     env_2_type(CCL_ZE_BARRIER, enable_ze_barrier);
     env_2_type(CCL_ZE_BIDIR_ALGO, enable_ze_bidir_algo);
     env_2_type(CCL_ZE_CACHE, enable_ze_cache);
+    env_2_type(CCL_ZE_DEVICE_CACHE_EVICT_SMALLEST, ze_device_cache_evict_smallest);
+    env_2_type(CCL_ZE_DEVICE_CACHE_UPPER_LIMIT, ze_device_cache_upper_limit);
+    env_2_type(CCL_ZE_DEVICE_CACHE_NUM_BLOCKS_IN_CHUNK, ze_device_cache_num_blocks_in_chunk);
+    env_2_enum(CCL_ZE_DEVICE_CACHE_POLICY, ccl::ze::device_cache_policy_names, ze_device_cache_policy);
     env_2_type(CCL_ZE_CACHE_OPEN_IPC_HANDLES, enable_ze_cache_open_ipc_handles);
     env_2_type(CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD, ze_cache_open_ipc_handles_threshold);
     if (enable_ze_cache == 0) {
@@ -487,7 +522,8 @@ void env_data::parse() {
     env_2_type(CCL_ZE_SINGLE_LIST, enable_ze_single_list);
     env_2_type(CCL_ZE_DISABLE_FAMILY_CHECK, disable_ze_family_check);
     env_2_type(CCL_ZE_DISABLE_PORT_CHECK, disable_ze_port_check);
-    env_2_type(CCL_ZE_DISABLE_OVERSUBSCRIPTION_CHECK, ze_disable_oversubscription_check);
+    env_2_type(CCL_ZE_ENABLE_OVERSUBSCRIPTION_FALLBACK, ze_enable_oversubscription_fallback);
+    env_2_type(CCL_ZE_ENABLE_OVERSUBSCRIPTION_THROW, ze_enable_oversubscription_throw);
     env_2_type(CCL_ZE_SERIALIZE, ze_serialize_mode);
     env_2_enum(CCL_ZE_COPY_ENGINE, ccl::ze::copy_engine_names, ze_copy_engine);
     env_2_enum(CCL_ZE_H2D_COPY_ENGINE, ccl::ze::h2d_copy_engine_names, ze_h2d_copy_engine);
@@ -522,6 +558,8 @@ void env_data::parse() {
     env_2_type(CCL_ZE_AUTO_TUNE_PORTS, enable_ze_auto_tune_ports);
     env_2_enum(CCL_ZE_IPC_EXCHANGE, ze::ipc_exchange_names, ze_ipc_exchange);
     env_2_type(CCL_ZE_DRM_BDF_SUPPORT, ze_drm_bdf_support);
+    env_2_type(CCL_ZE_PT2PT_READ, ze_pt2pt_read);
+    env_2_enum(CCL_ZE_TYPE2_TUNE_PORTS, type2_tune_mode_names, type2_mode);
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_PMIX
@@ -650,12 +688,14 @@ void env_data::print(int rank) {
              (barrier_algo_raw.length()) ? barrier_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(
         CCL_BCAST, ": ", (bcast_algo_raw.length()) ? bcast_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
+    LOG_INFO(CCL_RECV, ": ", (recv_algo_raw.length()) ? recv_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(
         CCL_REDUCE, ": ", (reduce_algo_raw.length()) ? reduce_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(
         CCL_REDUCE_SCATTER,
         ": ",
         (reduce_scatter_algo_raw.length()) ? reduce_scatter_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
+    LOG_INFO(CCL_SEND, ": ", (send_algo_raw.length()) ? send_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
     LOG_INFO(CCL_ALLGATHERV,
              ": ",
              (allgatherv_algo_raw.length()) ? allgatherv_algo_raw : CCL_ENV_STR_NOT_SPECIFIED);
@@ -730,6 +770,11 @@ void env_data::print(int rank) {
         CCL_ALLGATHERV_MONOLITHIC_PIPELINE_KERNEL, ": ", allgatherv_monolithic_pipeline_kernel);
     LOG_INFO(CCL_ALLTOALLV_MONOLITHIC_KERNEL, ": ", alltoallv_monolithic_kernel);
     LOG_INFO(CCL_ALLTOALLV_MONOLITHIC_READ_KERNEL, ": ", alltoallv_monolithic_read_kernel);
+
+    LOG_INFO(CCL_ALLGATHERV_PIPE_CHUNK_COUNT, ": ", allgatherv_pipe_chunk_count);
+    LOG_INFO(CCL_ALLREDUCE_PIPE_CHUNK_COUNT, ": ", allreduce_pipe_chunk_count);
+    LOG_INFO(CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT, ": ", reduce_scatter_pipe_chunk_count);
+    LOG_INFO(CCL_REDUCE_PIPE_CHUNK_COUNT, ": ", reduce_pipe_chunk_count);
 #endif // CCL_ENABLE_SYCL
 
     LOG_INFO(CCL_ALLREDUCE_NREDUCE_BUFFERING, ": ", allreduce_nreduce_buffering);
@@ -797,18 +842,24 @@ void env_data::print(int rank) {
     LOG_INFO(CCL_KERNEL_CLOSE_FD_WA, ": ", enable_close_fd_wa);
 
     LOG_INFO(CCL_SYCL_OUTPUT_EVENT, ": ", enable_sycl_output_event);
+    LOG_INFO(CCL_BARRIER_SYNC, ": ", sync_barrier);
     LOG_INFO(CCL_USE_HMEM, ": ", use_hmem);
 
     LOG_INFO(CCL_ZE_BARRIER, ": ", enable_ze_barrier);
     LOG_INFO(CCL_ZE_BIDIR_ALGO, ": ", enable_ze_bidir_algo);
     LOG_INFO(CCL_ZE_CACHE, ": ", enable_ze_cache);
+    LOG_INFO(CCL_ZE_DEVICE_CACHE_EVICT_SMALLEST, ": ", ze_device_cache_evict_smallest);
+    LOG_INFO(CCL_ZE_DEVICE_CACHE_UPPER_LIMIT, ": ", ze_device_cache_upper_limit);
+    LOG_INFO(CCL_ZE_DEVICE_CACHE_NUM_BLOCKS_IN_CHUNK, ": ", ze_device_cache_num_blocks_in_chunk);
+    LOG_INFO(CCL_ZE_DEVICE_CACHE_POLICY,
+             ": ",
+             str_by_enum(ccl::ze::device_cache_policy_names, ze_device_cache_policy));
     LOG_INFO(CCL_ZE_CACHE_OPEN_IPC_HANDLES, ": ", enable_ze_cache_open_ipc_handles);
     LOG_INFO(CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD, ": ", ze_cache_open_ipc_handles_threshold);
     LOG_INFO(CCL_ZE_CACHE_GET_IPC_HANDLES, ": ", enable_ze_cache_get_ipc_handles);
     LOG_INFO(CCL_ZE_SINGLE_LIST, ": ", enable_ze_single_list);
     LOG_INFO(CCL_ZE_DISABLE_FAMILY_CHECK, ": ", disable_ze_family_check);
     LOG_INFO(CCL_ZE_DISABLE_PORT_CHECK, ": ", disable_ze_port_check);
-    LOG_INFO(CCL_ZE_DISABLE_OVERSUBSCRIPTION_CHECK, ": ", ze_disable_oversubscription_check);
+    LOG_INFO(CCL_ZE_ENABLE_OVERSUBSCRIPTION_FALLBACK, ": ", ze_enable_oversubscription_fallback);
+    LOG_INFO(CCL_ZE_ENABLE_OVERSUBSCRIPTION_THROW, ": ", ze_enable_oversubscription_throw);
     LOG_INFO(CCL_ZE_SERIALIZE, ": ", ze_serialize_mode);
     LOG_INFO(CCL_ZE_COPY_ENGINE, ": ", str_by_enum(ccl::ze::copy_engine_names, ze_copy_engine));
     LOG_INFO(CCL_ZE_H2D_COPY_ENGINE,
@@ -837,6 +888,8 @@ void env_data::print(int rank) {
     LOG_INFO(CCL_ZE_AUTO_TUNE_PORTS, ": ", enable_ze_auto_tune_ports);
     LOG_INFO(CCL_ZE_IPC_EXCHANGE, ": ", str_by_enum(ze::ipc_exchange_names, ze_ipc_exchange));
     LOG_INFO(CCL_ZE_DRM_BDF_SUPPORT, ": ", ze_drm_bdf_support);
+    LOG_INFO(CCL_ZE_PT2PT_READ, ": ", ze_pt2pt_read);
+    LOG_INFO(CCL_ZE_TYPE2_TUNE_PORTS, ": ", str_by_enum(type2_tune_mode_names, type2_mode));
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_PMIX
@@ -1020,6 +1073,7 @@ int env_data::env_2_worker_affinity(int local_proc_idx, int local_proc_count) {
     size_t idx;
     char* env_to_parse = getenv(CCL_WORKER_AFFINITY);
+    long int temp_system_core_count;
     size_t system_core_count;
     size_t affinity_size = local_proc_count * worker_count;
 
@@ -1031,7 +1085,12 @@ int env_data::env_2_worker_affinity(int local_proc_idx, int local_proc_count) {
     }
     else {
         /* generate auto affinity as last N cores */
-        system_core_count = sysconf(_SC_NPROCESSORS_ONLN);
+        temp_system_core_count = sysconf(_SC_NPROCESSORS_ONLN);
+        // throw an error on sysconf failure (-1) or if core_count is invalid
+        CCL_THROW_IF_NOT(temp_system_core_count > 0,
+                         "system_core_count is incorrect: ",
+                         temp_system_core_count);
+        system_core_count = static_cast<size_t>(temp_system_core_count);
         for (idx = 0; idx < affinity_size; idx++) {
             if (idx < system_core_count) {
                 worker_affinity[idx] = system_core_count - idx - 1;
diff --git a/src/common/env/env.hpp b/src/common/env/env.hpp
index 6a0b5ae57..f9d25d07b 100644
--- a/src/common/env/env.hpp
+++ b/src/common/env/env.hpp
@@ -133,8 +133,10 @@ class env_data {
     std::string alltoallv_algo_raw;
     std::string barrier_algo_raw;
     std::string bcast_algo_raw;
+    std::string recv_algo_raw;
     std::string reduce_algo_raw;
     std::string reduce_scatter_algo_raw;
+    std::string send_algo_raw;
     // scale-out selection part
     std::string allgatherv_scaleout_algo_raw;
     std::string allreduce_scaleout_algo_raw;
@@ -142,8 +144,10 @@ class env_data {
     std::string alltoallv_scaleout_algo_raw;
     std::string barrier_scaleout_algo_raw;
     std::string bcast_scaleout_algo_raw;
+    std::string recv_scaleout_algo_raw;
     std::string reduce_scaleout_algo_raw;
     std::string reduce_scatter_scaleout_algo_raw;
+    std::string send_scaleout_algo_raw;
 
     int enable_unordered_coll;
     int enable_fusion;
@@ -181,6 +185,11 @@ class env_data {
     int allgatherv_monolithic_pipeline_kernel;
     int alltoallv_monolithic_kernel;
     int alltoallv_monolithic_read_kernel;
+
+    size_t allgatherv_pipe_chunk_count;
+    size_t allreduce_pipe_chunk_count;
+    size_t reduce_scatter_pipe_chunk_count;
+    size_t reduce_pipe_chunk_count;
 #endif // CCL_ENABLE_SYCL
 
     int allreduce_nreduce_buffering;
@@ -223,9 +232,15 @@ class env_data {
     int enable_sycl_output_event;
     int use_hmem;
 
+    int sync_barrier;
+
     int enable_ze_barrier;
     int enable_ze_bidir_algo;
     int enable_ze_cache;
+    int ze_device_cache_evict_smallest;
+    long ze_device_cache_upper_limit;
+    int ze_device_cache_num_blocks_in_chunk;
+    ccl::ze::device_cache_policy_mode ze_device_cache_policy;
     int enable_ze_cache_cmdlists;
     int enable_ze_cache_cmdqueues;
     int enable_ze_cache_event_pools;
@@ -235,7 +250,8 @@ class env_data {
     int enable_ze_single_list;
     int disable_ze_family_check;
     int disable_ze_port_check;
-    int ze_disable_oversubscription_check;
+    int ze_enable_oversubscription_fallback;
+    int ze_enable_oversubscription_throw;
     int ze_serialize_mode;
     ccl::ze::copy_engine_mode ze_copy_engine;
     ccl::ze::h2d_copy_engine_mode ze_h2d_copy_engine;
@@ -245,7 +261,6 @@ class env_data {
     int enable_ze_list_dump;
     int ze_queue_index_offset;
     int ze_close_ipc_wa;
-    int enable_ze_cmd_bypass;
     std::string ze_lib_path;
     int ze_enable;
     int ze_fini_wa;
@@ -253,6 +268,8 @@ class env_data {
     int enable_ze_auto_tune_ports;
     ccl::ze::ipc_exchange_mode ze_ipc_exchange;
     int ze_drm_bdf_support;
+    int ze_pt2pt_read;
+    type2_tune_mode type2_mode;
 #endif // CCL_ENABLE_SYCL
 
 #ifdef CCL_ENABLE_PMIX
diff --git a/src/common/env/vars.hpp b/src/common/env/vars.hpp
index 6d3f251d0..d00f19ddf 100644
--- a/src/common/env/vars.hpp
+++ b/src/common/env/vars.hpp
@@ -286,7 +286,41 @@ constexpr const char* CCL_REDUCE = "CCL_REDUCE";
  * By-default: "direct"
  */
 constexpr const char* CCL_REDUCE_SCATTER = "CCL_REDUCE_SCATTER";
+
+/**
+ * @brief Set recv algorithm
+ *
+ * @details
+ * RECV algorithms
+ * - direct  Uses prepost (d2h-h2d) copies to get host buffers and invoke mpi/ofi->recv()
+ * - topo    Topo scale-up algorithm (available if sycl and l0 are enabled)
+ * - offload Passes device buffers directly to the mpi/ofi layer,
+ *           skipping the d2h-h2d prepost copies; used by default for scale-out.
+ *           Sets extra MPI env vars to get better performance
+ *           (available if sycl and l0 are enabled)
+ *
+ * By-default: "topo" if sycl and l0 are enabled,
+ * otherwise "offload" for the ofi/mpi transport
+ */
+constexpr const char* CCL_RECV = "CCL_RECV";
+/**
+ * @brief Set send algorithm
+ *
+ * @details
+ * SEND algorithms
+ * - direct  Uses prepost (d2h-h2d) copies to get host buffers and invoke mpi/ofi->send()
+ * - topo    Topo scale-up algorithm (available if sycl and l0 are enabled)
+ * - offload Passes device buffers directly to the mpi/ofi layer,
+ *           skipping the d2h-h2d prepost copies; used by default for scale-out.
+ *           Sets extra MPI env vars to get better performance
+ *           (available if sycl and l0 are enabled)
+ *
+ * By-default: "topo" if sycl and l0 are enabled,
+ * otherwise "offload" for the ofi/mpi transport
+ */
+constexpr const char* CCL_SEND = "CCL_SEND";
 /** @} */
+
 constexpr const char* CCL_UNORDERED_COLL = "CCL_UNORDERED_COLL";
 /*
  * SCALEOUT
 */
@@ -451,7 +485,32 @@ constexpr const char* CCL_RS_MIN_CHUNK_SIZE = "CCL_RS_MIN_CHUNK_SIZE";
 constexpr const char* CCL_ALLGATHERV_TOPO_LARGE_SCALE = "CCL_ALLGATHERV_TOPO_LARGE_SCALE";
 constexpr const char* CCL_ALLGATHERV_TOPO_READ = "CCL_ALLGATHERV_TOPO_READ";
 constexpr const char* CCL_ALLTOALLV_TOPO_READ = "CCL_ALLTOALLV_TOPO_READ";
+/**
+ * @addtogroup OneCCLvars
+ * @{
+ */
+/**
+ * @brief Set this environment variable to select read or write based device-to-device data copy during the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives using device (GPU) buffers.
+ *
+ * @details
+ *
+ * Syntax
+ * CCL_REDUCE_SCATTER_TOPO_READ="<value>"
+ *
+ * Arguments
+ *
+ * "<value>" Description
+ * - 1       Uses read based copy to transfer data across GPUs for the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives (default).
+ * - 0       Uses write based copy to transfer data across GPUs for the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives.
+ *
+ * Description
+ *
+ * Set this environment variable to select read or write based device-to-device data copy during the reduce_scatter stage of Allreduce, Reduce, and Reduce-Scatter collectives using device (GPU) buffers.
+ *
+ * By-default: "1"
+ */
 constexpr const char* CCL_REDUCE_SCATTER_TOPO_READ = "CCL_REDUCE_SCATTER_TOPO_READ";
+/** @} */
 /**
  * @addtogroup OneCCLvars
  * @{
@@ -529,6 +588,137 @@ constexpr const char* CCL_ALLTOALLV_MONOLITHIC_KERNEL = "CCL_ALLTOALLV_MONOLITHI
 /** @} */
 constexpr const char* CCL_ALLTOALLV_MONOLITHIC_READ_KERNEL = "CCL_ALLTOALLV_MONOLITHIC_READ_KERNEL";
 constexpr const char* CCL_REDUCE_MONOLITHIC_KERNEL = "CCL_REDUCE_MONOLITHIC_KERNEL";
+
+/**
+ * @addtogroup OneCCLvars
+ * @{
+ */
+/**
+ * @brief Set this environment variable to enable the pipelining implementation for Allgatherv collectives using device (GPU) buffers
+ *
+ * @details
+ *
+ * Syntax
+ *
+ * CCL_ALLGATHERV_PIPE_CHUNK_COUNT="<count>"
+ * Arguments
+ *
+ * "<count>" Description
+ * - 0: (default) Bypasses the chunking/pipelining code and directly calls
+ *      the topology-aware code
+ * - 1: Calls the pipelining code with a single chunk. Effectively, it has
+ *      identical behavior and performance as with "0", but exercises the
+ *      chunking code path with a single chunk.
+ * - 2 or higher: Divides the message into as many logical parts, or chunks,
+ *      as specified. Then, it executes the collective with each logical
+ *      chunk. This should allow for several phases of the algorithm to
+ *      run in parallel, as long as they don't use the same physical
+ *      resource. Effectively, this should increase performance.
+ *
+ * Description
+ *
+ * Set this environment variable to control how many chunks are used for
+ * Allgatherv pipeline-based collectives using device (GPU) buffers.
+ *
+ * By-default: "0"
+ */
+constexpr const char* CCL_ALLGATHERV_PIPE_CHUNK_COUNT = "CCL_ALLGATHERV_PIPE_CHUNK_COUNT";
+
+/**
+ * @brief Set this environment variable to enable the pipelining implementation for Allreduce collectives using device (GPU) buffers
+ *
+ * @details
+ *
+ * Syntax
+ *
+ * CCL_ALLREDUCE_PIPE_CHUNK_COUNT="<count>"
+ * Arguments
+ *
+ * "<count>" Description
+ * - 0: (default) Bypasses the chunking/pipelining code and directly calls
+ *      the topology-aware code
+ * - 1: Calls the pipelining code with a single chunk. Effectively, it has
+ *      identical behavior and performance as with "0", but exercises the
+ *      chunking code path with a single chunk.
+ * - 2 or higher: Divides the message into as many logical parts, or chunks,
+ *      as specified. Then, it executes the collective with each logical
+ *      chunk. This should allow for several phases of the algorithm to
+ *      run in parallel, as long as they don't use the same physical
+ *      resource. Effectively, this should increase performance.
+ *
+ * Description
+ *
+ * Set this environment variable to control how many chunks are used for
+ * Allreduce pipeline-based collectives using device (GPU) buffers.
+ *
+ * By-default: "0"
+ */
+constexpr const char* CCL_ALLREDUCE_PIPE_CHUNK_COUNT = "CCL_ALLREDUCE_PIPE_CHUNK_COUNT";
+
+/**
+ * @brief Set this environment variable to enable the pipelining implementation for Reduce_Scatter collectives using device (GPU) buffers
+ *
+ * @details
+ *
+ * Syntax
+ *
+ * CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT="<count>"
+ * Arguments
+ *
+ * "<count>" Description
+ * - 0: (default) Bypasses the chunking/pipelining code and directly calls
+ *      the topology-aware code
+ * - 1: Calls the pipelining code with a single chunk. Effectively, it has
+ *      identical behavior and performance as with "0", but exercises the
+ *      chunking code path with a single chunk.
+ * - 2 or higher: Divides the message into as many logical parts, or chunks,
+ *      as specified. Then, it executes the collective with each logical
+ *      chunk. This should allow for several phases of the algorithm to
+ *      run in parallel, as long as they don't use the same physical
+ *      resource. Effectively, this should increase performance.
+ *
+ * Description
+ *
+ * Set this environment variable to control how many chunks are used for
+ * Reduce_Scatter pipeline-based collectives using device (GPU) buffers.
+ *
+ * By-default: "0"
+ */
+constexpr const char* CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT = "CCL_REDUCE_SCATTER_PIPE_CHUNK_COUNT";
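Note: a small standalone sketch of the chunking these variables control — splitting a message of count elements into chunk_count logical parts, with the remainder spread over the first chunks (the library's actual splitting logic may differ):

#include <cstddef>
#include <iostream>

int main() {
    const std::size_t count = 10, chunk_count = 3;
    std::size_t offset = 0;
    for (std::size_t c = 0; c < chunk_count; ++c) {
        // Evenly sized chunks; the first (count % chunk_count) chunks get one extra element.
        std::size_t chunk = count / chunk_count + (c < count % chunk_count ? 1 : 0);
        std::cout << "chunk " << c << ": offset=" << offset << " count=" << chunk << '\n';
        offset += chunk;
    }
    // prints: chunk 0: offset=0 count=4 / chunk 1: offset=4 count=3 / chunk 2: offset=7 count=3
}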
+
+/**
+ * @brief Set this environment variable to enable the pipelining implementation for Reduce collectives using device (GPU) buffers
+ *
+ * @details
+ *
+ * Syntax
+ *
+ * CCL_REDUCE_PIPE_CHUNK_COUNT="<count>"
+ * Arguments
+ *
+ * "<count>" Description
+ * - 0: (default) Bypasses the chunking/pipelining code and directly calls
+ *      the topology-aware code
+ * - 1: Calls the pipelining code with a single chunk. Effectively, it has
+ *      identical behavior and performance as with "0", but exercises the
+ *      chunking code path with a single chunk.
+ * - 2 or higher: Divides the message into as many logical parts, or chunks,
+ *      as specified. Then, it executes the collective with each logical
+ *      chunk. This should allow for several phases of the algorithm to
+ *      run in parallel, as long as they don't use the same physical
+ *      resource. Effectively, this should increase performance.
+ *
+ * Description
+ *
+ * Set this environment variable to control how many chunks are used for
+ * Reduce pipeline-based collectives using device (GPU) buffers.
+ *
+ * By-default: "0"
+ */
+constexpr const char* CCL_REDUCE_PIPE_CHUNK_COUNT = "CCL_REDUCE_PIPE_CHUNK_COUNT";
+
+/** @} */
+
 #endif // CCL_ENABLE_SYCL
 
 constexpr const char* CCL_ALLREDUCE_NREDUCE_BUFFERING = "CCL_ALLREDUCE_NREDUCE_BUFFERING";
@@ -645,10 +835,41 @@ constexpr const char* CCL_USE_HMEM = "CCL_USE_HMEM";
 constexpr const char* CCL_ZE_BARRIER = "CCL_ZE_BARRIER";
 constexpr const char* CCL_ZE_BIDIR_ALGO = "CCL_ZE_BIDIR_ALGO";
 constexpr const char* CCL_ZE_CACHE = "CCL_ZE_CACHE";
+constexpr const char* CCL_ZE_DEVICE_CACHE_EVICT_SMALLEST = "CCL_ZE_DEVICE_CACHE_EVICT_SMALLEST";
+constexpr const char* CCL_ZE_DEVICE_CACHE_UPPER_LIMIT = "CCL_ZE_DEVICE_CACHE_UPPER_LIMIT";
+constexpr const char* CCL_ZE_DEVICE_CACHE_NUM_BLOCKS_IN_CHUNK = "CCL_ZE_DEVICE_CACHE_NUM_BLOCKS_IN_CHUNK";
+constexpr const char* CCL_ZE_DEVICE_CACHE_POLICY = "CCL_ZE_DEVICE_CACHE_POLICY";
 constexpr const char* CCL_ZE_CACHE_OPEN_IPC_HANDLES = "CCL_ZE_CACHE_OPEN_IPC_HANDLES";
 constexpr const char* CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD = "CCL_ZE_CACHE_OPEN_IPC_HANDLES_THRESHOLD";
 constexpr const char* CCL_ZE_CACHE_GET_IPC_HANDLES = "CCL_ZE_CACHE_GET_IPC_HANDLES";
-constexpr const char* CCL_ZE_DISABLE_OVERSUBSCRIPTION_CHECK = "CCL_ZE_DISABLE_OVERSUBSCRIPTION_CHECK";
+/**
+ * @addtogroup OneCCLvars
+ * @{
+ */
+/**
+ * @brief Set to enable oversubscription in the topo fallback stage for
+ * all collectives.
+ *
+ * @details This environment variable enables or disables the oversubscription fallback
+ * from the topo algorithm to copy in/out
+ *
+ * "<value>" : "0", "1"
+ *
+ * By-default: "1"
+ */
+constexpr const char* CCL_ZE_ENABLE_OVERSUBSCRIPTION_FALLBACK = "CCL_ZE_ENABLE_OVERSUBSCRIPTION_FALLBACK";
+/**
+ * @brief Set to enable the oversubscription throw for all collectives.
+ *
+ * @details This environment variable enables or disables the oversubscription throw check
+ *
+ * "<value>" : "0", "1"
+ *
+ * By-default: "1"
+ */
+constexpr const char* CCL_ZE_ENABLE_OVERSUBSCRIPTION_THROW = "CCL_ZE_ENABLE_OVERSUBSCRIPTION_THROW";
+/** @} */
+
 constexpr const char* CCL_ZE_SERIALIZE = "CCL_ZE_SERIALIZE";
 
 constexpr const char* CCL_ZE_COPY_ENGINE = "CCL_ZE_COPY_ENGINE";
diff --git a/src/common/env/vars_experimental.hpp b/src/common/env/vars_experimental.hpp
index 701783fe7..811fac8a6 100644
--- a/src/common/env/vars_experimental.hpp
+++ b/src/common/env/vars_experimental.hpp
@@ -118,6 +118,56 @@ constexpr const char* CCL_REDUCE_SCATTER_FALLBACK_ALGO = "CCL_REDUCE_SCATTER_FAL
 * By-default: "1"
 */
 constexpr const char* CCL_ZE_AUTO_TUNE_PORTS = "CCL_ZE_AUTO_TUNE_PORTS";
+
+/**
+ * @brief Enable switching of the read and write protocols for the pt2pt topo algorithm
+ *
+ * @details Control the pt2pt read/write protocols.\n Read protocol:\n
+ * The SEND side exchanges its handle with the RECV side, and the copy
+ * operation is executed on the RECV side, where the dst buf
+ * is the local buffer and the source buffer is the remote buffer.\n
+ *
+ * Write protocol:\n
+ * The RECV side exchanges its handle with the SEND side, and the copy
+ * operation is executed on the SEND side, where the dst buf is the
+ * remote buffer and the source buffer is the local buffer.
+ *\n
+ * "<value>" : "0", "1"
+ *\n
+ * By-default: "1"
+ */
+constexpr const char* CCL_ZE_PT2PT_READ = "CCL_ZE_PT2PT_READ";
+
+/**
+ * @brief Tunable value for collectives to adjust copy engine indexes
+ *
+ * @details Use copy engine indexes 2, 4, 6 on hosts with 6 ports
+ * for allreduce, reduce, and allgatherv
+ * "<value>":
+ * "on" - always use write mode with the calculated indexes
+ * "off" - always disabled
+ * "detected" - determined by the detection logic
+ * "undetected" - the default value, used before the detection
+ * logic runs
+ *
+ * By-default: "undetected"
+ */
+constexpr const char* CCL_ZE_TYPE2_TUNE_PORTS = "CCL_ZE_TYPE2_TUNE_PORTS";
+
+/**
+ * @brief Switch ccl::barrier() host-sync / host-async options
+ *
+ * @details Historically, ccl::barrier() was always synchronous,
+ * which does not match the oneCCL asynchronous concept. Like other
+ * collectives, ccl::barrier() should be host-asynchronous if possible.
+ * Since that would be too much to change at once, we start with an
+ * experimental variable that introduces the option to make barrier
+ * host-asynchronous. Use CCL_BARRIER_SYNC=0 to achieve that.
+ *
+ * By-default: "1 (SYNC)"
+ */
+constexpr const char* CCL_BARRIER_SYNC = "CCL_BARRIER_SYNC";
+/** @} */
 /** @} */
 
 #endif // CCL_ENABLE_SYCL
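Note: assuming the semantics documented above, user code that must stay correct under CCL_BARRIER_SYNC=0 would wait on the returned event explicitly. A minimal usage sketch against the public oneCCL API (with the default synchronous behavior the explicit wait is redundant but harmless):

#include "oneapi/ccl.hpp"

void barrier_example(ccl::communicator& comm, ccl::stream& stream) {
    // ccl::barrier() returns a ccl::event; once the barrier is
    // host-asynchronous, the wait() below is required for correctness.
    ccl::event ev = ccl::barrier(comm, stream);
    ev.wait();
}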
diff --git a/src/common/event/impls/host_event.cpp b/src/common/event/impls/host_event.cpp
index 89acb8923..aac4780f2 100644
--- a/src/common/event/impls/host_event.cpp
+++ b/src/common/event/impls/host_event.cpp
@@ -13,6 +13,7 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 */
+#include "coll/coll_util.hpp"
 #include "common/request/request.hpp"
 #include "common/event/impls/host_event.hpp"
 #include "exec/exec.hpp"
@@ -30,6 +31,13 @@ host_event_impl::host_event_impl(ccl_request* r) : req(r) {
     }
 #ifdef CCL_ENABLE_SYCL
     native_event = req->share_native_event();
+    sync_event = req->share_sync_event();
+#ifdef CCL_ENABLE_ZE
+    if (sync_event) {
+        stream = req->get_sched()->coll_param.stream;
+        ze_context = stream->get_ze_context();
+    }
+#endif // CCL_ENABLE_ZE
 #endif // CCL_ENABLE_SYCL
     if (req->synchronous) {
         if (!ccl::global_data::get().executor.get()->is_locked) {
@@ -39,7 +47,6 @@ host_event_impl::host_event_impl(ccl_request* r) : req(r) {
             // in place and in this case we mark request as completed,
             // all calls to wait() or test() will do nothing
             completed = true;
-            synchronous = true;
         }
     }
 
@@ -63,15 +70,40 @@ host_event_impl::~host_event_impl() {
     // but we need to ensure that the bound schedule is actually destroyed. For this
    // to happen, call wait() to do a proper finalization and cleanup.
     wait();
+
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+    if (sync_event) {
+        auto& pools = ccl::global_data::get().ze_data->dynamic_event_pools;
+        auto pool_it = pools.find(ze_context);
+        if (pool_it == pools.end()) {
+            LOG_ERROR("pool must be initialized for the context");
+        }
+        else {
+            try {
+                pool_it->second.put_event(ccl::utils::get_native_event(*sync_event));
+            }
+            // runtime_error is __SYCL2020_DEPRECATED, catch generic exception
+            catch (sycl::exception& e) {
+                LOG_ERROR(
+                    "sycl event not recovered: ", e.what(), " potential resource/memory leak");
+            }
+        }
+    }
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 }
 
 void host_event_impl::wait() {
     if (!completed) {
         auto* exec = ccl::global_data::get().executor.get();
-        ccl_wait_impl<ccl_sched>(exec, req);
-        if (synchronous && !exec->is_locked) {
+        auto wait_result = ccl_wait_impl<ccl_sched>(exec, req);
+        if (wait_result == ccl_wait_result_completed_not_released) {
             ccl_release_request(req);
         }
+#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE)
+        //TODO call native_event->wait() both for out-of-order and in-order queues (MLSL-2374)
+        if (native_event && ccl::is_queue_in_order(stream))
+            native_event->wait();
+#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
         completed = true;
     }
 }
diff --git a/src/common/event/impls/host_event.hpp b/src/common/event/impls/host_event.hpp
index b2be10e2f..67f6937a6 100644
--- a/src/common/event/impls/host_event.hpp
+++ b/src/common/event/impls/host_event.hpp
@@ -14,6 +14,7 @@
  limitations under the License.
 */
 #pragma once
+#include "oneapi/ccl.hpp"
 #include "oneapi/ccl/types.hpp"
 #include "oneapi/ccl/types_policy.hpp"
 #include "common/event/impls/event_impl.hpp"
@@ -32,15 +33,21 @@ class host_event_impl final : public event_impl {
     bool test() override;
     bool cancel() override;
     event::native_t& get_native() override;
+    host_event_impl& operator=(const host_event_impl&) = delete;
+    host_event_impl(const host_event_impl&) = delete;
 
 private:
     ccl_request* req = nullptr;
     bool completed = false;
-    bool synchronous = false;
 
 #ifdef CCL_ENABLE_SYCL
     // the actual sycl::event returned to the user via ccl::event.get_native()
     std::shared_ptr<sycl::event> native_event;
+    std::shared_ptr<sycl::event> sync_event;
+    ze_context_handle_t ze_context{};
+#ifdef CCL_ENABLE_ZE
+    ccl_stream* stream = nullptr;
+#endif // CCL_ENABLE_ZE
 #endif // CCL_ENABLE_SYCL
 };
diff --git a/src/common/global/global.cpp b/src/common/global/global.cpp
index ba52c2feb..bafd780f2 100644
--- a/src/common/global/global.cpp
+++ b/src/common/global/global.cpp
@@ -133,6 +133,9 @@ void global_data::init_resize_independent_objects() {
     algorithm_selector->init();
 
     hwloc_wrapper.reset(new ccl_hwloc_wrapper());
+
+    metrics_profiler.reset(new profile::metrics_manager());
+    metrics_profiler->init();
 }
 
 void global_data::reset_resize_dependent_objects() {
@@ -146,6 +149,7 @@ void global_data::reset_resize_independent_objects() {
     parallelizer.reset();
     algorithm_selector.reset();
     hwloc_wrapper.reset();
+    metrics_profiler.reset();
 }
 
 void global_data::getenv_local_coord(const char* local_proc_idx_env_name,
diff --git a/src/common/global/global.hpp b/src/common/global/global.hpp
index 2804cd61e..c9e9de517 100644
--- a/src/common/global/global.hpp
+++ b/src/common/global/global.hpp
@@ -81,13 +81,14 @@ class global_data {
     std::unique_ptr<ccl_fusion_manager> fusion_manager;
     std::unique_ptr<ccl_algorithm_selector_wrapper<CCL_COLL_LIST>> algorithm_selector;
     std::unique_ptr<ccl_hwloc_wrapper> hwloc_wrapper;
+    std::unique_ptr<profile::metrics_manager> metrics_profiler;
 #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL)
     std::unique_ptr<ze::global_data_desc> ze_data;
 #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL
 
     static thread_local bool is_worker_thread;
-
-    bool is_ft_enabled;
+    bool is_ft_enabled{ false };
 
     int get_local_proc_idx() const {
         return local_proc_idx;
@@ -109,14 +110,14 @@ class global_data {
     void init_resize_independent_objects();
     void reset_resize_independent_objects();
 
-    int local_proc_idx;
-    int local_proc_count;
+    int local_proc_idx{ ccl_comm::invalid_rank };
+    int local_proc_count{ ccl::utils::invalid_err_code };
 
     void getenv_local_coord(const char* local_proc_idx_env_name,
                             const char* local_proc_count_env_name);
     void set_local_coord();
-    env_data env_object;
-    os_information os_info;
+    env_data env_object{};
+    os_information os_info{};
 };
 
 } // namespace ccl
diff --git a/src/common/global/ze/ze_data.cpp b/src/common/global/ze/ze_data.cpp
index a5357171e..20d91fc9d 100644
--- a/src/common/global/ze/ze_data.cpp
+++ b/src/common/global/ze/ze_data.cpp
@@ -41,7 +41,7 @@ global_data_desc::global_data_desc() {
 
     // enables driver initialization and
     // dependencies for system management
-    setenv("ZES_ENABLE_SYSMAN", "1", 0);
+    setenv("ZES_ENABLE_SYSMAN", "1", 1);
 
     ZE_CALL(zeInit, (ZE_INIT_FLAG_GPU_ONLY));
 
diff --git a/src/common/global/ze/ze_data.hpp b/src/common/global/ze/ze_data.hpp
index 3b3afab48..e8d98e7fe 100644
--- a/src/common/global/ze/ze_data.hpp
+++ b/src/common/global/ze/ze_data.hpp
@@ -18,7 +18,7 @@
 #include <unordered_map>
 
 #include "common/global/ze/ze_fd_manager.hpp"
-#include "sched/entry/ze/ze_cache.hpp"
+#include "sched/entry/ze/cache/ze_cache.hpp"
 #include "sched/entry/ze/ze_primitives.hpp"
 #include "sched/ze/ze_event_manager.hpp"
 
diff --git a/src/common/global/ze/ze_fd_manager.cpp b/src/common/global/ze/ze_fd_manager.cpp
index 49061e440..a6cd24a9c 100644
--- a/src/common/global/ze/ze_fd_manager.cpp
+++ b/src/common/global/ze/ze_fd_manager.cpp
@@ -20,7 +20,6 @@
 #include "common/utils/utils.hpp"
 #include "common/utils/yield.hpp"
 
-#include <dirent.h>
 #ifdef CCL_ENABLE_DRM
 #include "i915_drm.h"
 #endif // CCL_ENABLE_DRM
@@ -75,9 +74,13 @@ bool fd_manager::is_pidfd_supported() {
         fds.push_back(fd);
     };
 
+    mode_t prev_umask = umask(rwe_umask);
+
     int file_fd = mkstemp(filename);
     check_fd(file_fd);
 
+    umask(prev_umask);
+
     int pidfd = syscall(__NR_pidfd_open, pid, 0);
     check_fd(pidfd);
 
@@ -208,6 +211,7 @@ std::vector<int> fd_manager::init_device_fds() {
     DIR *dir = opendir(device_dir);
     CCL_THROW_IF_NOT(dir, "opendir failed: could not open device directory");
+    dir_raii dir_obj(dir);
 
     LOG_DEBUG("search for all devices in the device directory");
     while ((ent = readdir(dir)) != nullptr) {
@@ -219,6 +223,7 @@ std::vector<int> fd_manager::init_device_fds() {
         CCL_THROW_IF_NOT(ret > 0 || ret <= NAME_MAX, "could not create device name");
         device_names.push_back(device_name);
     }
+
     return fill_device_fds(device_names);
 }
 
@@ -234,6 +239,7 @@ std::vector<bdf_info> fd_manager::init_device_bdfs(const size_t size) {
 
     DIR *dir = opendir(device_dir);
     CCL_THROW_IF_NOT(dir, "bdfs opendir failed: could not open device directory");
+    dir_raii dir_obj(dir);
 
     LOG_DEBUG("BDF search for all devices in the device directory");
     while ((ent = readdir(dir)) != nullptr) {
@@ -256,8 +262,6 @@ std::vector<bdf_info> fd_manager::init_device_bdfs(const size_t size) {
 
     qsort(&bdfs[0], bdfs.size(), sizeof(bdf_info), fd_manager::compare_bdf);
     LOG_DEBUG("sorted bdf size: ", bdfs.size());
-
-    closedir(dir);
 #endif // ZE_PCI_PROPERTIES_EXT_NAME
     return bdfs;
 }
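Note: the umask save/restore around mkstemp() above can also be expressed as a small RAII guard, in the same spirit as the dir_raii wrapper introduced below; an illustrative sketch (not part of the patch):

#include <sys/stat.h>
#include <sys/types.h>

// Applies a restrictive umask for the lifetime of the guard and restores
// the previous mask on scope exit, even if an exception is thrown.
class umask_guard {
public:
    explicit umask_guard(mode_t tmp_mask) : prev_(umask(tmp_mask)) {}
    ~umask_guard() {
        umask(prev_);
    }
    umask_guard(const umask_guard&) = delete;
    umask_guard& operator=(const umask_guard&) = delete;

private:
    mode_t prev_;
};

// usage: { umask_guard guard{ 077 }; /* mkstemp(...) */ } // mask restored here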
@@ -558,7 +562,11 @@ void fd_manager::exchange_device_fds() {
                      ", errno: ",
                      strerror(errno));
 
-        setsockopt(all_socks[i], SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
+        if (setsockopt(all_socks[i], SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)) ==
+            ccl::utils::invalid_err_code) {
+            CCL_THROW("setsockopt failed: sock: ", all_socks[i], ", errno: ", strerror(errno));
+        }
+
         sockaddr.sun_family = AF_UNIX;
         strncpy(sockaddr.sun_path, sock_name.c_str(), sizeof(sockaddr.sun_path) - 1);
 
@@ -602,7 +610,10 @@ void fd_manager::exchange_device_fds() {
                  ", errno: ",
                  strerror(errno));
 
-        setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
+        if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)) ==
+            ccl::utils::invalid_err_code) {
+            CCL_THROW("setsockopt failed: sock: ", sock, ", errno: ", strerror(errno));
+        }
         sockaddr.sun_family = AF_UNIX;
         strncpy(sockaddr.sun_path, sock_name.c_str(), sizeof(sockaddr.sun_path) - 1);
 
@@ -631,7 +642,10 @@ void fd_manager::exchange_device_fds() {
                  ", errno: ",
                  strerror(errno));
 
-        setsockopt(all_socks[0], SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
+        if (setsockopt(all_socks[0], SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)) ==
+            ccl::utils::invalid_err_code) {
+            CCL_THROW("setsockopt failed: sock: ", all_socks[0], ", errno: ", strerror(errno));
+        }
 
         close(sock);
     }
diff --git a/src/common/global/ze/ze_fd_manager.hpp b/src/common/global/ze/ze_fd_manager.hpp
index 79d2d5c18..a5df5430e 100644
--- a/src/common/global/ze/ze_fd_manager.hpp
+++ b/src/common/global/ze/ze_fd_manager.hpp
@@ -17,6 +17,8 @@
 
 #include "oneapi/ccl/config.h"
 
+#include <dirent.h>
+
 #include <map>
 #include <string>
 #include <vector>
@@ -34,6 +36,49 @@ static std::map<ipc_exchange_mode, std::string> ipc_exchange_names = {
     std::make_pair(ipc_exchange_mode::none, "none")
 };
 
+// RAII wrapper owning a DIR handle; copying is deleted so that two wrappers
+// can never share ownership and double-close the same handle
+class dir_raii {
+public:
+    dir_raii(DIR* dir) : dir_(dir) {}
+
+    // copying would lead to a double closedir() on the same handle
+    dir_raii(const dir_raii& other) = delete;
+    dir_raii& operator=(const dir_raii& other) = delete;
+
+    // move constructor
+    dir_raii(dir_raii&& other) : dir_(other.dir_) {
+        other.dir_ = nullptr;
+    }
+
+    // move assignment operator
+    dir_raii& operator=(dir_raii&& other) {
+        if (this != &other) {
+            if (dir_) {
+                closedir(dir_);
+            }
+            dir_ = other.dir_;
+            other.dir_ = nullptr;
+        }
+        return *this;
+    }
+
+    ~dir_raii() {
+        if (dir_) {
+            closedir(dir_);
+        }
+    }
+
+private:
+    DIR* dir_;
+};
+
 struct bdf_info {
     uint32_t domain{};
     uint32_t bus{};
@@ -53,6 +98,10 @@ class fd_manager {
     static constexpr int invalid_physical_idx = -1;
     static constexpr int hexadecimal_base = 16;
     static constexpr int bdf_start_pos = 12;
+    // temporarily restricts the default permissions of newly created files
+    // to read, write, and execute for the owner only, with no permissions
+    // for group and others
+    static constexpr mode_t rwe_umask = 077;
 
     fd_manager();
     fd_manager(const fd_manager&) = delete;
diff --git a/src/common/request/request.cpp b/src/common/request/request.cpp
index 802a9e961..9058fe997 100644
--- a/src/common/request/request.cpp
+++ b/src/common/request/request.cpp
@@ -37,11 +37,6 @@ ccl_request::~ccl_request() {
     if (counter != 0 && !ccl::global_data::get().is_ft_enabled) {
         LOG_WARN("unexpected completion_counter ", counter);
     }
-
-    // notify sched about request release to update its state.
-    // if event is empty, sched will that
-    // TODO: move to the proper place
-    // sched.release_sync_event(this);
 }
 
 bool ccl_request::complete() {
diff --git a/src/common/request/request.hpp b/src/common/request/request.hpp
index bbd4d8081..ba82060a9 100644
--- a/src/common/request/request.hpp
+++ b/src/common/request/request.hpp
@@ -70,6 +70,10 @@ class alignas(CACHELINE_SIZE) ccl_request {
         return *sync_event;
     }
 
+    std::shared_ptr<sycl::event>& share_sync_event() {
+        return sync_event;
+    }
+
     bool has_output_event() const {
         // by default the event is empty
         if (!native_event)
diff --git a/src/common/stream/stream.hpp b/src/common/stream/stream.hpp
index b1a763dd5..3a29c8069 100644
--- a/src/common/stream/stream.hpp
+++ b/src/common/stream/stream.hpp
@@ -29,7 +29,7 @@
 
 namespace ccl {
 
-enum class device_family { unknown, family1, family2 };
+enum class device_family { unknown, family1, family2, family3 };
 
 std::string to_string(device_family family);
 
diff --git a/src/common/utils/exchange_utils.cpp b/src/common/utils/exchange_utils.cpp
index de35fb7e3..dfc431cb6 100644
--- a/src/common/utils/exchange_utils.cpp
+++ b/src/common/utils/exchange_utils.cpp
@@ -67,6 +67,69 @@ bool allgatherv(std::shared_ptr<atl_base_comm> comm,
     return ret;
 }
 
+void check(std::shared_ptr<atl_base_comm> comm, atl_req_t& req) {
+    atl_status_t atl_status = comm->check(0, req);
+
+    if (unlikely(atl_status != ATL_STATUS_SUCCESS)) {
+        CCL_THROW("check failed: atl_status: ", atl_status_to_str(atl_status));
+    }
+
+    while (!req.is_completed) {
+        atl_status_t status = comm->check(0, req);
+        if (unlikely(status != ATL_STATUS_SUCCESS)) {
+            CCL_THROW("check failed: atl_status: ", atl_status_to_str(status));
+        }
+        if (req.is_completed) {
+            break;
+        }
+    }
+}
+
+void recv(std::shared_ptr<atl_base_comm> comm,
+          void* buf,
+          int count,
+          int peer_rank,
+          uint64_t tag,
+          bool sync) {
+    atl_req_t req{};
+    comm->recv(0 /* ep_idx */, buf, count, peer_rank /*src rank*/, tag, req);
+
+    if (sync) {
+        check(comm, req);
+    }
+    else {
+        CCL_THROW("unexpected sync parameter");
+    }
+}
+
+void send(std::shared_ptr<atl_base_comm> comm,
+          void* buf,
+          int count,
+          int peer_rank,
+          uint64_t tag,
+          bool sync) {
+    atl_req_t req{};
+    comm->send(0 /* ep_idx */, buf, count, peer_rank /*dst rank*/, tag, req);
+
+    if (sync) {
+        check(comm, req);
+    }
+    else {
+        CCL_THROW("unexpected sync parameter");
+    }
+}
+
+void send_ack_to_peer(std::shared_ptr<atl_base_comm> comm, uint64_t tag, int peer_rank) {
+    ccl::utils::send(std::move(comm), nullptr, 0, peer_rank, tag);
+    LOG_DEBUG("send ack msg with tag: ", tag);
+}
+
+void recv_ack_from_peer(std::shared_ptr<atl_base_comm> comm, uint64_t tag, int peer_rank) {
+    char ack[1];
+    ccl::utils::recv(std::move(comm), ack, 0, peer_rank, tag);
+    LOG_DEBUG("recv ack msg with tag: ", tag);
+}
+
 int check_msg_retval(std::string operation_name,
                      ssize_t bytes,
                      struct iovec iov,
diff --git a/src/common/utils/exchange_utils.hpp b/src/common/utils/exchange_utils.hpp
index af5106b64..3f44ea2dd 100644
--- a/src/common/utils/exchange_utils.hpp
+++ b/src/common/utils/exchange_utils.hpp
@@ -36,6 +36,25 @@ bool allgatherv(std::shared_ptr<atl_base_comm> comm,
                 const std::vector<size_t>& recv_bytes,
                 bool sync = true);
 
+void check(std::shared_ptr<atl_base_comm> comm, atl_req_t& req);
+
+void recv(std::shared_ptr<atl_base_comm> comm,
+          void* buf,
+          int count,
+          int peer_rank,
+          uint64_t tag,
+          bool sync = true);
+
+void send(std::shared_ptr<atl_base_comm> comm,
+          void* buf,
+          int count,
+          int peer_rank,
+          uint64_t tag,
+          bool sync = true);
+
+void send_ack_to_peer(std::shared_ptr<atl_base_comm> comm, uint64_t tag, int peer_rank);
+void recv_ack_from_peer(std::shared_ptr<atl_base_comm> comm, uint64_t tag, int peer_rank);
+
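Note: a sketch of how the ack helpers above pair up between two ranks. comm and tag are assumed to come from the surrounding schedule code, and sync_pair itself is hypothetical — it only illustrates that both sides must issue the complementary calls with the same tag:

#include <memory>
#include "common/utils/exchange_utils.hpp"

// Hypothetical two-sided handshake: the lower rank signals first and then
// waits, the higher rank does the opposite, so neither side can run ahead.
void sync_pair(std::shared_ptr<atl_base_comm> comm, uint64_t tag, int my_rank, int peer_rank) {
    if (my_rank < peer_rank) {
        ccl::utils::send_ack_to_peer(comm, tag, peer_rank); // zero-byte send
        ccl::utils::recv_ack_from_peer(comm, tag, peer_rank);
    }
    else {
        ccl::utils::recv_ack_from_peer(comm, tag, peer_rank);
        ccl::utils::send_ack_to_peer(comm, tag, peer_rank);
    }
}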
check_msg_retval(std::string operation_name, ssize_t bytes, struct iovec iov, diff --git a/src/common/utils/profile.cpp b/src/common/utils/profile.cpp new file mode 100644 index 000000000..c0cb6bd0b --- /dev/null +++ b/src/common/utils/profile.cpp @@ -0,0 +1,46 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "common/utils/utils.hpp" +#include "common/log/log.hpp" + +ccl::profile::metrics_manager::~metrics_manager() { + finalize(); +} + +void ccl::profile::metrics_manager::init() { + allreduce_pipe_nonparallel_calls_per_count.clear(); + allreduce_pipe_parallel_calls_per_count.clear(); +} + +void ccl::profile::metrics_manager::finalize() { + std::string allreduce_pipe_metrics; + + for (auto calls_per_count : allreduce_pipe_nonparallel_calls_per_count) { + allreduce_pipe_metrics += "nonparallel_calls_per_count[" + + std::to_string(calls_per_count.first) + + "]=" + std::to_string(calls_per_count.second) + ",\n"; + } + + for (auto calls_per_count : allreduce_pipe_parallel_calls_per_count) { + allreduce_pipe_metrics += " parallel_calls_per_count[" + + std::to_string(calls_per_count.first) + + "]=" + std::to_string(calls_per_count.second) + ",\n"; + } + + if (!allreduce_pipe_metrics.empty()) { + LOG_INFO("allreduce_pipe_metrics: [\n", allreduce_pipe_metrics, "]"); + } +} diff --git a/src/sched/entry/coll/coll_entry_param.hpp b/src/common/utils/profile.hpp similarity index 50% rename from src/sched/entry/coll/coll_entry_param.hpp rename to src/common/utils/profile.hpp index f19c31c18..29b375723 100644 --- a/src/sched/entry/coll/coll_entry_param.hpp +++ b/src/common/utils/profile.hpp @@ -15,24 +15,21 @@ */ #pragma once -#include "coll/coll.hpp" +#include -struct ccl_coll_entry_param { - ccl_coll_type ctype{ ccl_coll_last_value }; - ccl_buffer send_buf{}; - ccl_buffer recv_buf{}; - size_t count{}; - size_t send_count{}; - std::vector send_bufs; - std::vector recv_bufs; - const size_t* send_counts{}; - const size_t* recv_counts{}; - ccl_datatype dtype{}; - ccl::reduction reduction{ ccl::reduction::sum }; - int root{}; - int peer_rank = CCL_INVALID_PEER_RANK_IDX; - ccl_comm* comm{}; - ccl_stream* stream{}; - ccl_coll_algo hint_algo{}; - bool is_scaleout{ false }; +namespace ccl { +namespace profile { + +class metrics_manager { + void finalize(); + +public: + std::map allreduce_pipe_nonparallel_calls_per_count, + allreduce_pipe_parallel_calls_per_count; + + void init(); + ~metrics_manager(); }; + +} // namespace profile +} // namespace ccl diff --git a/src/common/utils/sycl_utils.cpp b/src/common/utils/sycl_utils.cpp index 5447a424b..465135e1d 100644 --- a/src/common/utils/sycl_utils.cpp +++ b/src/common/utils/sycl_utils.cpp @@ -67,9 +67,9 @@ sycl::event submit_barrier(sycl::queue queue) { sycl::event submit_barrier(sycl::queue queue, sycl::event event) { #if ICPX_VERSION >= 140000 - return queue.ext_oneapi_submit_barrier({ event }); + return queue.ext_oneapi_submit_barrier({ std::move(event) }); #elif ICPX_VERSION < 140000 - return queue.submit_barrier({ 
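// The metrics_manager introduced above is a plain calls-per-count histogram:
// collective code bumps a counter keyed by element count, and finalize(),
// run from the destructor, logs the totals. Assuming integer-keyed maps
// (per the std::to_string() calls in finalize()), a recording site would be:
//     metrics.allreduce_pipe_parallel_calls_per_count[count]++;      // pipelined path
//     metrics.allreduce_pipe_nonparallel_calls_per_count[count]++;   // fallback path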
event });
+    return queue.submit_barrier({ std::move(event) });
 #endif // ICPX_VERSION
 }
diff --git a/src/common/utils/utils.cpp b/src/common/utils/utils.cpp
index e1393fced..a5fe2be11 100644
--- a/src/common/utils/utils.cpp
+++ b/src/common/utils/utils.cpp
@@ -76,7 +76,7 @@ uintptr_t get_aligned_offset_byte(const void* ptr,
                                   const size_t buf_size_bytes,
                                   const size_t mem_align_bytes) {
     // find the number of data items to remove to start from aligned bytes
-    unsigned long pre_align_offset_byte = (uintptr_t)ptr % mem_align_bytes;
+    unsigned long pre_align_offset_byte = reinterpret_cast<uintptr_t>(ptr) % mem_align_bytes;
     if (pre_align_offset_byte != 0) {
         pre_align_offset_byte = mem_align_bytes - pre_align_offset_byte;
     }
diff --git a/src/common/utils/utils.hpp b/src/common/utils/utils.hpp
index e17eb97f8..b183d7250 100644
--- a/src/common/utils/utils.hpp
+++ b/src/common/utils/utils.hpp
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -31,6 +32,7 @@
 #include
 #include
 
+#include "common/utils/profile.hpp"
 #include "common/utils/spinlock.hpp"
 #include "internal_types.hpp"
@@ -152,9 +154,20 @@
 static constexpr int invalid_err_code = -1;
 static constexpr int invalid_fd = -1;
 static constexpr int invalid_mem_handle = -1;
 static constexpr int invalid_pid = -1;
+static constexpr size_t initial_count_value = 0;
+static constexpr int invalid_peer_rank = -1;
+static constexpr int invalid_rank = -1;
+static constexpr int invalid_host_idx = -1;
+static constexpr int invalid_bytes_value = -1;
 
 enum class align_kernels { unaligned, aligned, count };
 
+enum class pt2pt_handle_exchange_role { sender, receiver, none };
+struct pt2pt_handle_exchange_info {
+    int peer_rank = invalid_err_code;
+    pt2pt_handle_exchange_role role = pt2pt_handle_exchange_role::none;
+};
+
 size_t get_ptr_diff(const void* ptr1, const void* ptr2);
 size_t pof2(size_t number);
 size_t aligned_sz(size_t size, size_t alignment);
@@ -235,5 +248,12 @@ void clear_and_push_back(std::vector<T>& v, T elem) {
     v.push_back(elem);
 }
 
+template <typename T>
+std::string to_hex(T integer) {
+    std::stringstream ss;
+    ss << "0x" << std::hex << std::setw(sizeof(T) * 2) << std::setfill('0') << integer;
+    return ss.str();
+}
+
 } // namespace utils
 } // namespace ccl
diff --git a/src/exec/exec.cpp b/src/exec/exec.cpp
index 5ce3d257a..45d7a78af 100644
--- a/src/exec/exec.cpp
+++ b/src/exec/exec.cpp
@@ -251,13 +251,19 @@ void ccl_executor::start(ccl_sched* sched, bool extra_sched) {
         workers[0]->add(sched);
         return;
     }
-    size_t worker_idx;
     auto& partial_scheds = sched->get_subscheds();
+    CCL_ASSERT(!partial_scheds.empty(), "at least one sub-sched should exist");
+    /* Assign each partial sched to a worker in a round-robin manner,
+       using worker_idx only as the starting point of the assignment.
+       This covers the case when the maximum sched_id value overflows:
+       partial sched ids are then no longer sequential, so deriving the
+       worker from each id could map several scheds onto the same worker,
+       which can lead to hangs on algos such as barrier.
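+
+       Example: with 4 workers and a starting worker_idx of 3, five
+       partial scheds land on workers 3, 0, 1, 2, 3 - strict round-robin
+       regardless of how their sched_ids wrapped.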
*/ + size_t worker_idx = get_worker_idx_by_sched_id(partial_scheds.front().get()); for (size_t idx = 0; idx < partial_scheds.size(); idx++) { - worker_idx = get_worker_idx_by_sched_id(partial_scheds[idx].get()); LOG_DEBUG( "worker idx: ", worker_idx, ", coll: ", ccl_coll_type_to_str(sched->coll_param.ctype)); workers[worker_idx]->add(partial_scheds[idx].get()); + worker_idx = (worker_idx + 1) % workers.size(); } } diff --git a/src/exec/exec.hpp b/src/exec/exec.hpp index 6b6046bb1..186a7e41a 100644 --- a/src/exec/exec.hpp +++ b/src/exec/exec.hpp @@ -31,6 +31,12 @@ class ccl_worker; class ccl_service_worker; class ccl_sched; +enum ccl_wait_result { + ccl_wait_result_completed_released, + ccl_wait_result_completed_not_released, + ccl_wait_result_not_completed, +}; + class alignas(CACHELINE_SIZE) ccl_executor { friend class ccl_listener; @@ -110,6 +116,7 @@ inline void ccl_release_sched(ccl_sched* sched) { } } +/// frees memory if and only if req is detached from its scheduler inline void ccl_release_request(ccl_request* req) { auto* sched = req->get_sched(); @@ -127,7 +134,8 @@ inline void ccl_release_request(ccl_request* req) { } template -inline void ccl_wait_impl(ccl_executor* exec, ccl_request* request) { +inline ccl_wait_result ccl_wait_impl(ccl_executor* exec, ccl_request* request) { + ccl_wait_result ret = ccl_wait_result_not_completed; exec->wait(request); if (!exec->is_locked) { LOG_DEBUG( @@ -135,9 +143,15 @@ inline void ccl_wait_impl(ccl_executor* exec, ccl_request* request) { request, " completed, sched ", ccl_coll_type_to_str(static_cast(request->get_sched())->coll_param.ctype)); - if (!request->synchronous) + if (!request->synchronous) { ccl_release_request(request); + ret = ccl_wait_result_completed_released; + } + else { + ret = ccl_wait_result_completed_not_released; + } } + return ret; } template diff --git a/src/hwloc/hwloc_wrapper.cpp b/src/hwloc/hwloc_wrapper.cpp index 50c6d3859..13714dd24 100644 --- a/src/hwloc/hwloc_wrapper.cpp +++ b/src/hwloc/hwloc_wrapper.cpp @@ -17,20 +17,17 @@ #include "hwloc/hwloc_wrapper.hpp" ccl_numa_node::ccl_numa_node() - : idx(CCL_UNDEFINED_NUMA_NODE), - os_idx(CCL_UNDEFINED_NUMA_NODE), + : os_idx(CCL_UNDEFINED_NUMA_NODE), mem_in_mb(0), core_count(0), membind_support(0) {} -ccl_numa_node::ccl_numa_node(int idx, - int os_idx, +ccl_numa_node::ccl_numa_node(int os_idx, size_t mem_in_mb, int core_count, const std::vector& cpus, int membind_support) - : idx(idx), - os_idx(os_idx), + : os_idx(os_idx), mem_in_mb(mem_in_mb), core_count(core_count), cpus(cpus), @@ -40,7 +37,7 @@ std::string ccl_numa_node::to_string() { std::stringstream ss; ss << "{" - << "idx: " << idx << ", memory: " << mem_in_mb << " MB" + << "os_idx: " << os_idx << ", memory: " << mem_in_mb << " MB" << ", cores: " << core_count << ", cpus: " << cpus.size() << ", membind: " << membind_support << "}"; @@ -84,7 +81,6 @@ ccl_hwloc_wrapper::ccl_hwloc_wrapper() LOG_WARN("no support for memory binding of current thread"); } - int idx = 0; hwloc_obj_t numa_node = nullptr; while ((numa_node = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, numa_node)) != nullptr) { @@ -106,8 +102,7 @@ ccl_hwloc_wrapper::ccl_hwloc_wrapper() } } numa_nodes.push_back( - ccl_numa_node(idx, os_idx, mem_in_mb, core_count, cpus, check_membind(os_idx))); - ++idx; + ccl_numa_node(os_idx, mem_in_mb, core_count, cpus, check_membind(os_idx))); } } @@ -128,11 +123,7 @@ std::string ccl_hwloc_wrapper::to_string() { ss << "{\n"; ss << " membind_thread_supported: " << membind_thread_supported << "\n"; for (auto& 
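// A sketch of consuming the new tri-state result of ccl_wait_impl (executor
// and request pointers assumed to come from the surrounding code):
//     switch (ccl_wait_impl<ccl_sched>(exec, req)) {
//         case ccl_wait_result_completed_released:     /* req was freed, don't touch */ break;
//         case ccl_wait_result_completed_not_released: /* synchronous req, still owned */ break;
//         case ccl_wait_result_not_completed:          /* completion not observed here */ break;
//     }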
node : numa_nodes) { - ss << " numa: { " - << "idx: " << node.idx << ", os idx: " << node.os_idx - << ", memory: " << node.mem_in_mb << " MB" - << ", cores: " << node.core_count << ", cpus: " << node.cpus.size() - << ", membind: " << node.membind_support << " }\n"; + ss << " numa: " << node.to_string() << "\n"; } ss << "}"; } @@ -227,7 +218,7 @@ int ccl_hwloc_wrapper::get_numa_node_by_cpu(int cpu) { for (auto& node : numa_nodes) { for (auto cpu_idx : node.cpus) { if (cpu_idx == cpu) { - return node.idx; + return node.os_idx; } } } @@ -241,20 +232,38 @@ ccl_numa_node ccl_hwloc_wrapper::get_numa_node(int numa_node) { return {}; } + for (auto node : numa_nodes) { + if (node.os_idx == numa_node) { + return node; + } + } + + // is_valid_numa_node() iterates through the numa_nodes vector. To avoid + // iterating through the vector twice, we first check whether we find it + // (see loop above) before calling is_valid_numa_node(). if (!is_valid_numa_node(numa_node)) { LOG_WARN("invalid NUMA node ", numa_node, ", NUMA node count ", get_numa_node_count()); return {}; } - return numa_nodes[numa_node]; + // We should never reach this point. + CCL_THROW("invalid NUMA node ", + numa_node, + ". (But this should've been caught by is_valid_numa_node)"); } bool ccl_hwloc_wrapper::is_valid_numa_node(int numa_node) { - if ((numa_node == CCL_UNDEFINED_NUMA_NODE) || (numa_node < 0) || - (numa_node >= static_cast(get_numa_node_count()))) { + if ((numa_node == CCL_UNDEFINED_NUMA_NODE) || (numa_node < 0)) { return false; } - return true; + + for (auto node : numa_nodes) { + if (node.os_idx == numa_node) { + return true; + } + } + + return false; } bool ccl_hwloc_wrapper::check_membind(int numa_node) { diff --git a/src/hwloc/hwloc_wrapper.hpp b/src/hwloc/hwloc_wrapper.hpp index 2919c2857..586d796a9 100644 --- a/src/hwloc/hwloc_wrapper.hpp +++ b/src/hwloc/hwloc_wrapper.hpp @@ -20,7 +20,6 @@ #define CCL_HWLOC_INVALID_NUMA_NODE (-1) struct ccl_numa_node { - int idx; int os_idx; size_t mem_in_mb; int core_count; @@ -28,8 +27,7 @@ struct ccl_numa_node { int membind_support; ccl_numa_node(); - ccl_numa_node(int idx, - int os_idx, + ccl_numa_node(int os_idx, size_t mem_in_mb, int core_count, const std::vector& cpus, diff --git a/src/kernels/kernels.cl b/src/kernels/kernels.cl index fa0681602..f21d29314 100644 --- a/src/kernels/kernels.cl +++ b/src/kernels/kernels.cl @@ -17,25 +17,32 @@ __kernel void empty_kernel(int my_rank, #define PTR_ARGS(Dtype, name, b) __global Dtype* name##b +// PTR_ARGS#: 1-7 args number, max case is 16 ranks #define PTR_ARGS1(Dtype, name) PTR_ARGS(Dtype, name, 1) #define PTR_ARGS2(Dtype, name) PTR_ARGS1(Dtype, name), PTR_ARGS(Dtype, name, 2) #define PTR_ARGS3(Dtype, name) PTR_ARGS2(Dtype, name), PTR_ARGS(Dtype, name, 3) #define PTR_ARGS4(Dtype, name) PTR_ARGS3(Dtype, name), PTR_ARGS(Dtype, name, 4) #define PTR_ARGS5(Dtype, name) PTR_ARGS4(Dtype, name), PTR_ARGS(Dtype, name, 5) +#define PTR_ARGS6(Dtype, name) PTR_ARGS5(Dtype, name), PTR_ARGS(Dtype, name, 6) +#define PTR_ARGS7(Dtype, name) PTR_ARGS6(Dtype, name), PTR_ARGS(Dtype, name, 7) #define ALL_PTR_ARGS(Dtype, name, N) PTR_ARGS(Dtype, name, 0), PTR_ARGS##N(Dtype, name) #define CONST_ARGS(Dtype, name, b) const Dtype name##b +// CONST_ARGS#: 1-7 args number, max case is 16 ranks #define CONST_ARGS1(Dtype, name) CONST_ARGS(Dtype, name, 1) #define CONST_ARGS2(Dtype, name) CONST_ARGS1(Dtype, name), CONST_ARGS(Dtype, name, 2) #define CONST_ARGS3(Dtype, name) CONST_ARGS2(Dtype, name), CONST_ARGS(Dtype, name, 3) #define CONST_ARGS4(Dtype, name) 
CONST_ARGS3(Dtype, name), CONST_ARGS(Dtype, name, 4) #define CONST_ARGS5(Dtype, name) CONST_ARGS4(Dtype, name), CONST_ARGS(Dtype, name, 5) +#define CONST_ARGS6(Dtype, name) CONST_ARGS5(Dtype, name), CONST_ARGS(Dtype, name, 6) +#define CONST_ARGS7(Dtype, name) CONST_ARGS6(Dtype, name), CONST_ARGS(Dtype, name, 7) #define ALLTOALLV_ARGS(Dtype, b) \ __global Dtype *in_buf##b, __global Dtype *out_buf##b, unsigned long count##b, +// ALLTOALLV_ARGS#: 2-16 args number, max case is 16 ranks #define ALLTOALLV_ARGS2(Dtype) ALLTOALLV_ARGS(Dtype, 0) ALLTOALLV_ARGS(Dtype, 1) #define ALLTOALLV_ARGS4(Dtype) \ ALLTOALLV_ARGS2(Dtype) ALLTOALLV_ARGS(Dtype, 2) ALLTOALLV_ARGS(Dtype, 3) @@ -47,6 +54,10 @@ __kernel void empty_kernel(int my_rank, ALLTOALLV_ARGS8(Dtype) ALLTOALLV_ARGS(Dtype, 8) ALLTOALLV_ARGS(Dtype, 9) #define ALLTOALLV_ARGS12(Dtype) \ ALLTOALLV_ARGS10(Dtype) ALLTOALLV_ARGS(Dtype, 10) ALLTOALLV_ARGS(Dtype, 11) +#define ALLTOALLV_ARGS14(Dtype) \ + ALLTOALLV_ARGS12(Dtype) ALLTOALLV_ARGS(Dtype, 12) ALLTOALLV_ARGS(Dtype, 13) +#define ALLTOALLV_ARGS16(Dtype) \ + ALLTOALLV_ARGS14(Dtype) ALLTOALLV_ARGS(Dtype, 14) ALLTOALLV_ARGS(Dtype, 15) #define ALLTOALLV_COPY(b) \ for (size_t idx = thread_id; idx < count##b; idx += work_group_size) { \ @@ -58,18 +69,24 @@ __kernel void empty_kernel(int my_rank, dst##b[idx] = src##b[idx]; \ } +// ALLTOALLV_COPY#: 2-16 args number, max case is 16 ranks #define ALLTOALLV_COPY2 ALLTOALLV_COPY(0) ALLTOALLV_COPY(1) #define ALLTOALLV_COPY4 ALLTOALLV_COPY2 ALLTOALLV_COPY(2) ALLTOALLV_COPY(3) #define ALLTOALLV_COPY6 ALLTOALLV_COPY4 ALLTOALLV_COPY(4) ALLTOALLV_COPY(5) #define ALLTOALLV_COPY8 ALLTOALLV_COPY6 ALLTOALLV_COPY(6) ALLTOALLV_COPY(7) #define ALLTOALLV_COPY10 ALLTOALLV_COPY8 ALLTOALLV_COPY(8) ALLTOALLV_COPY(9) #define ALLTOALLV_COPY12 ALLTOALLV_COPY10 ALLTOALLV_COPY(10) ALLTOALLV_COPY(11) +#define ALLTOALLV_COPY14 ALLTOALLV_COPY12 ALLTOALLV_COPY(12) ALLTOALLV_COPY(13) +#define ALLTOALLV_COPY16 ALLTOALLV_COPY14 ALLTOALLV_COPY(14) ALLTOALLV_COPY(15) +// BUFFER_COPY#: 1-7 args number, max case is 16 ranks #define BUFFER_COPY1(dst, src) BUFFER_COPY(dst, src, 1) #define BUFFER_COPY2(dst, src) BUFFER_COPY1(dst, src) BUFFER_COPY(dst, src, 2) #define BUFFER_COPY3(dst, src) BUFFER_COPY2(dst, src) BUFFER_COPY(dst, src, 3) #define BUFFER_COPY4(dst, src) BUFFER_COPY3(dst, src) BUFFER_COPY(dst, src, 4) #define BUFFER_COPY5(dst, src) BUFFER_COPY4(dst, src) BUFFER_COPY(dst, src, 5) +#define BUFFER_COPY6(dst, src) BUFFER_COPY5(dst, src) BUFFER_COPY(dst, src, 6) +#define BUFFER_COPY7(dst, src) BUFFER_COPY6(dst, src) BUFFER_COPY(dst, src, 7) #define DEFINE_ALLTOALLV_KERNEL(DtypeName, Dtype, OpName, OpFunc, N) \ __kernel void alltoallv_kernel_##N##_##DtypeName##_##OpName( \ @@ -83,11 +100,14 @@ __kernel void empty_kernel(int my_rank, #define REDUCTION(OpFunc, b) \ xelink_tmp_buf##b[idx] = OpFunc(local_send_buf##b[idx], mdfi_buf##b[idx]); +// REDUCTION#: 1-7 args number, max case is 16 ranks #define REDUCTION1(OpFunc) REDUCTION(OpFunc, 0) #define REDUCTION2(OpFunc) REDUCTION1(OpFunc) REDUCTION(OpFunc, 1) #define REDUCTION3(OpFunc) REDUCTION2(OpFunc) REDUCTION(OpFunc, 2) #define REDUCTION4(OpFunc) REDUCTION3(OpFunc) REDUCTION(OpFunc, 3) #define REDUCTION5(OpFunc) REDUCTION4(OpFunc) REDUCTION(OpFunc, 4) +#define REDUCTION6(OpFunc) REDUCTION5(OpFunc) REDUCTION(OpFunc, 5) +#define REDUCTION7(OpFunc) REDUCTION6(OpFunc) REDUCTION(OpFunc, 6) // reduction for local_reduce #define FIRST_REDUCE(OpFunc, b0, b1) \ @@ -95,11 +115,14 @@ __kernel void empty_kernel(int my_rank, #define REDUCE(OpFunc, b) 
output_buf[idx] = OpFunc(output_buf[idx], xelink_tmp_buf##b[idx]); +// REDUCE#: 1-7 args number, max case is 16 ranks #define REDUCE1(OpFunc) FIRST_REDUCE(OpFunc, 0, 1) #define REDUCE2(OpFunc) REDUCE1(OpFunc) REDUCE(OpFunc, 2) #define REDUCE3(OpFunc) REDUCE2(OpFunc) REDUCE(OpFunc, 3) #define REDUCE4(OpFunc) REDUCE3(OpFunc) REDUCE(OpFunc, 4) #define REDUCE5(OpFunc) REDUCE4(OpFunc) REDUCE(OpFunc, 5) +#define REDUCE6(OpFunc) REDUCE5(OpFunc) REDUCE(OpFunc, 6) +#define REDUCE7(OpFunc) REDUCE6(OpFunc) REDUCE(OpFunc, 7) #define DEFINE_REDUCE_READ_WRITE_KERNEL(DtypeName, Dtype, OpName, OpFunc, N) \ __kernel void reduce_read_write_kernel_##N##_##DtypeName##_##OpName( \ @@ -197,6 +220,7 @@ __kernel void empty_kernel(int my_rank, } \ } +// DEFINE_REDUCE_MONOLITHIC__KERNEL: 1-7 kernels, max case is 16 ranks #define DEFINE_REDUCE_MONOLITHIC_1_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ __kernel void reduce_monolithic_kernel_1_##DtypeName##_##OpName( \ ulong count, \ @@ -297,6 +321,61 @@ __kernel void empty_kernel(int my_rank, } \ } +#define DEFINE_REDUCE_MONOLITHIC_6_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void reduce_monolithic_kernel_6_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + const __global Dtype* peer_buffer1, \ + const __global Dtype* peer_buffer2, \ + const __global Dtype* peer_buffer3, \ + const __global Dtype* peer_buffer4, \ + const __global Dtype* peer_buffer5, \ + const __global Dtype* peer_buffer6, \ + __global Dtype* output_buffer) { \ + DEBUG_BLOCK(printf("in reduce_monolithic_kernel_6\n")); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + Dtype sum = input_buffer[idx]; \ + sum = OpFunc(sum, peer_buffer1[idx]); \ + sum = OpFunc(sum, peer_buffer2[idx]); \ + sum = OpFunc(sum, peer_buffer3[idx]); \ + sum = OpFunc(sum, peer_buffer4[idx]); \ + sum = OpFunc(sum, peer_buffer5[idx]); \ + sum = OpFunc(sum, peer_buffer6[idx]); \ + output_buffer[idx] = sum; \ + } \ + } + +#define DEFINE_REDUCE_MONOLITHIC_7_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void reduce_monolithic_kernel_7_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + const __global Dtype* peer_buffer1, \ + const __global Dtype* peer_buffer2, \ + const __global Dtype* peer_buffer3, \ + const __global Dtype* peer_buffer4, \ + const __global Dtype* peer_buffer5, \ + const __global Dtype* peer_buffer6, \ + const __global Dtype* peer_buffer7, \ + __global Dtype* output_buffer) { \ + DEBUG_BLOCK(printf("in reduce_monolithic_kernel_7\n")); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + Dtype sum = input_buffer[idx]; \ + sum = OpFunc(sum, peer_buffer1[idx]); \ + sum = OpFunc(sum, peer_buffer2[idx]); \ + sum = OpFunc(sum, peer_buffer3[idx]); \ + sum = OpFunc(sum, peer_buffer4[idx]); \ + sum = OpFunc(sum, peer_buffer5[idx]); \ + sum = OpFunc(sum, peer_buffer6[idx]); \ + sum = OpFunc(sum, peer_buffer7[idx]); \ + output_buffer[idx] = sum; \ + } \ + } + +// DEFINE_WRITE_MONOLITHIC__KERNEL: 1-7 kernels, max case is 16 ranks #define DEFINE_WRITE_MONOLITHIC_1_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ __kernel void write_monolithic_kernel_1_##DtypeName##_##OpName( \ ulong count, const __global Dtype* input_buffer, __global Dtype* peer_buffer1) { \ @@ -385,6 +464,56 @@ __kernel void empty_kernel(int 
my_rank, } \ } +#define DEFINE_WRITE_MONOLITHIC_6_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void write_monolithic_kernel_6_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + __global Dtype* peer_buffer1, \ + __global Dtype* peer_buffer2, \ + __global Dtype* peer_buffer3, \ + __global Dtype* peer_buffer4, \ + __global Dtype* peer_buffer5, \ + __global Dtype* peer_buffer6) { \ + DEBUG_BLOCK(printf("in write_monolithic_kernel_6 count %d\n", count)); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + const Dtype val = input_buffer[idx]; \ + peer_buffer1[idx] = val; \ + peer_buffer2[idx] = val; \ + peer_buffer3[idx] = val; \ + peer_buffer4[idx] = val; \ + peer_buffer5[idx] = val; \ + peer_buffer6[idx] = val; \ + } \ + } + +#define DEFINE_WRITE_MONOLITHIC_7_KERNEL(DtypeName, Dtype, OpName, OpFunc) \ + __kernel void write_monolithic_kernel_7_##DtypeName##_##OpName( \ + ulong count, \ + const __global Dtype* input_buffer, \ + __global Dtype* peer_buffer1, \ + __global Dtype* peer_buffer2, \ + __global Dtype* peer_buffer3, \ + __global Dtype* peer_buffer4, \ + __global Dtype* peer_buffer5, \ + __global Dtype* peer_buffer6, \ + __global Dtype* peer_buffer7) { \ + DEBUG_BLOCK(printf("in write_monolithic_kernel_7 count %d\n", count)); \ + const size_t work_group_size = get_global_size(0); \ + const size_t thread_id = get_global_id(0); \ + for (size_t idx = thread_id; idx < count; idx += work_group_size) { \ + const Dtype val = input_buffer[idx]; \ + peer_buffer1[idx] = val; \ + peer_buffer2[idx] = val; \ + peer_buffer3[idx] = val; \ + peer_buffer4[idx] = val; \ + peer_buffer5[idx] = val; \ + peer_buffer6[idx] = val; \ + peer_buffer7[idx] = val; \ + } \ + } + // Monolithic kernel reads data from buffers in Xelink peers and then writes it to buffers in MDFI peer #define DEFINE_READ_WRITE_MONOLITHIC_KERNEL(DtypeName, Dtype, OpName, OpFunc, N) \ __kernel void read_write_monolithic_kernel_##N##_##DtypeName##_##OpName( \ @@ -512,14 +641,18 @@ DEFINE_FP16OPS(half) DEFINE_ALL_KERNELS_N(KernelName, 6) \ DEFINE_ALL_KERNELS_N(KernelName, 8) \ DEFINE_ALL_KERNELS_N(KernelName, 10) \ - DEFINE_ALL_KERNELS_N(KernelName, 12) + DEFINE_ALL_KERNELS_N(KernelName, 12) \ + DEFINE_ALL_KERNELS_N(KernelName, 14) \ + DEFINE_ALL_KERNELS_N(KernelName, 16) #define DEFINE_ALL_KERNELS_PEERS_PLANE(KernelName) \ DEFINE_ALL_KERNELS_N(KernelName, 1) \ DEFINE_ALL_KERNELS_N(KernelName, 2) \ DEFINE_ALL_KERNELS_N(KernelName, 3) \ DEFINE_ALL_KERNELS_N(KernelName, 4) \ - DEFINE_ALL_KERNELS_N(KernelName, 5) + DEFINE_ALL_KERNELS_N(KernelName, 5) \ + DEFINE_ALL_KERNELS_N(KernelName, 6) \ + DEFINE_ALL_KERNELS_N(KernelName, 7) #define DEFINE_ALL_KERNELS_OP_N(KernelName, N) \ DEFINE_KERNELS_WITH_OP_N(KernelName, sum, N) \ @@ -537,7 +670,9 @@ DEFINE_FP16OPS(half) DEFINE_ALL_KERNELS_OP_N(KernelName, 2) \ DEFINE_ALL_KERNELS_OP_N(KernelName, 3) \ DEFINE_ALL_KERNELS_OP_N(KernelName, 4) \ - DEFINE_ALL_KERNELS_OP_N(KernelName, 5) + DEFINE_ALL_KERNELS_OP_N(KernelName, 5) \ + DEFINE_ALL_KERNELS_OP_N(KernelName, 6) \ + DEFINE_ALL_KERNELS_OP_N(KernelName, 7) DEFINE_ALL_KERNELS_PEERS(ALLTOALLV) DEFINE_ALL_KERNELS_PEERS_PLANE(READ_WRITE_MONOLITHIC) @@ -553,15 +688,21 @@ DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_2) DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_3) DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_4) DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_5) +DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_6) 
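// For reference, DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_6) above bottoms out in
// the kernel macro defined earlier in this file; e.g. for float/sum it emits:
//     __kernel void reduce_monolithic_kernel_6_float_sum(
//         ulong count, const __global float* input_buffer,
//         const __global float* peer_buffer1, /* ... */ const __global float* peer_buffer6,
//         __global float* output_buffer);
// The new 6/7 peer variants and the 14/16 alltoallv variants extend the
// families to the stated 16-rank maximum (presumably 8 ranks per plane,
// i.e. at most 7 same-plane peers, plus up to 16 pairwise buffers).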
+DEFINE_ALL_KERNELS(REDUCE_MONOLITHIC_7) DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_1, custom) DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_2, custom) DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_3, custom) DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_4, custom) DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_5, custom) +DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_6, custom) +DEFINE_KERNELS_WITH_OP(WRITE_MONOLITHIC_7, custom) DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_1, custom) DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_2, custom) DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_3, custom) DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_4, custom) DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_5, custom) +DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_6, custom) +DEFINE_KERNELS_WITH_LP_OP(WRITE_MONOLITHIC_7, custom) diff --git a/src/kernels/kernels.spv b/src/kernels/kernels.spv index e7b2737f6..1c6ba7e8d 100644 Binary files a/src/kernels/kernels.spv and b/src/kernels/kernels.spv differ diff --git a/src/parallelizer/parallelizer.cpp b/src/parallelizer/parallelizer.cpp index 600adf95d..985d8cdc8 100644 --- a/src/parallelizer/parallelizer.cpp +++ b/src/parallelizer/parallelizer.cpp @@ -34,7 +34,7 @@ ccl::status ccl_parallelizer::process(ccl_sched* sched, bool update_sched_id) { #ifdef CCL_ENABLE_SYCL ccl_coll_param& param = sched->coll_param; if (param.stream && param.stream->is_sycl_device_stream() && - (!param.device_send_bufs.empty() || !param.device_recv_bufs.empty())) { + (!param.send_dev_bufs.empty() || !param.recv_dev_bufs.empty())) { process_pre_post_copies(sched); } process_output_event(sched); @@ -83,14 +83,17 @@ ccl::status ccl_parallelizer::process_pre_post_copies(ccl_sched* sched) { if ((coll_type == ccl_coll_allgatherv) && coll_param.is_inplace(ccl_coll_param::buf_type::device) && - (coll_param.device_recv_bufs.size() == 1)) { - device_in_buf_offset = std::accumulate( - coll_param.recv_counts.begin(), coll_param.recv_counts.begin() + my_rank, 0); + (coll_param.recv_dev_bufs.size() == 1)) { + device_in_buf_offset = std::accumulate(coll_param.recv_counts.begin(), + coll_param.recv_counts.begin() + my_rank, + ccl::utils::initial_count_value); LOG_TRACE("device_in_buf_offset = ", device_in_buf_offset); } - size_t total_d2h_count = std::accumulate(d2h_counts.begin(), d2h_counts.end(), 0); - size_t total_h2d_count = std::accumulate(h2d_counts.begin(), h2d_counts.end(), 0); + size_t total_d2h_count = + std::accumulate(d2h_counts.begin(), d2h_counts.end(), ccl::utils::initial_count_value); + size_t total_h2d_count = + std::accumulate(h2d_counts.begin(), h2d_counts.end(), ccl::utils::initial_count_value); if (total_d2h_count) { for (size_t idx = 0; idx < sched_count; idx++) { @@ -340,6 +343,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i part_coll_param.ctype = ccl_coll_partial; part_coll_param.stream = sched->coll_param.stream; part_coll_param.comm = comm; + part_coll_param.is_pt2pt = sched->coll_param.is_pt2pt; sched->add_subsched(part_coll_param, update_sched_id); } @@ -368,10 +372,12 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i case ccl_coll_alltoall: case ccl_coll_alltoallv: if (coll_type == ccl_coll_alltoallv) { - a2av_send_count = std::accumulate( - coll_param.send_counts.begin(), coll_param.send_counts.end(), 0); - a2av_recv_count = std::accumulate( - coll_param.recv_counts.begin(), coll_param.recv_counts.end(), 0); + a2av_send_count = std::accumulate(coll_param.send_counts.begin(), + coll_param.send_counts.end(), + ccl::utils::initial_count_value); + 
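// Note on these accumulate changes: std::accumulate deduces its result type
// from the initial value, so a literal 0 keeps the sum in int and large
// size_t counts can overflow; ccl::utils::initial_count_value is a size_t
// zero, which keeps the whole sum in size_t:
//     std::accumulate(v.begin(), v.end(), 0);         // int sum, bug-prone
//     std::accumulate(v.begin(), v.end(), size_t(0)); // size_t sum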
a2av_recv_count = std::accumulate(coll_param.recv_counts.begin(), + coll_param.recv_counts.end(), + ccl::utils::initial_count_value); } else { a2av_send_count = coll_param.get_send_count() * comm_size; @@ -395,8 +401,9 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i offsets[idx] = offsets[idx - 1] + counts[idx - 1] * dtype_size; } } - ag_recv_count = - std::accumulate(coll_param.recv_counts.begin(), coll_param.recv_counts.end(), 0); + ag_recv_count = std::accumulate(coll_param.recv_counts.begin(), + coll_param.recv_counts.end(), + ccl::utils::initial_count_value); ag_recv_bytes = ag_recv_count * dtype_size; break; case ccl_coll_recv: @@ -422,7 +429,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i case ccl_coll_barrier: sched->sync_subscheds(); for (idx = 0; idx < part_count; idx++) { - ccl_coll_entry_param param{}; + ccl_coll_param param{ false }; param.ctype = ccl_coll_barrier; param.dtype = ccl_datatype_int8; param.comm = comm; @@ -432,7 +439,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i break; case ccl_coll_bcast: for (idx = 0; idx < part_count; idx++) { - ccl_coll_entry_param param{}; + ccl_coll_param param{ false }; param.ctype = ccl_coll_bcast; param.recv_buf = ccl_buffer(coll_param.get_recv_buf_ptr(), coll_param.get_recv_count() * dtype_size, @@ -449,7 +456,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i case ccl_coll_reduce: for (idx = 0; idx < part_count; idx++) { - ccl_coll_entry_param param{}; + ccl_coll_param param{ false }; param.ctype = ccl_coll_reduce; param.send_buf = ccl_buffer(coll_param.get_send_buf_ptr(), coll_param.get_send_count() * dtype_size, @@ -472,7 +479,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i case ccl_coll_reduce_scatter: for (idx = 0; idx < part_count; idx++) { - ccl_coll_entry_param param{}; + ccl_coll_param param{ false }; param.ctype = ccl_coll_reduce_scatter; bool inplace = coll_param.is_inplace(); @@ -499,7 +506,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i case ccl_coll_allreduce: { for (idx = 0; idx < part_count; idx++) { - ccl_coll_entry_param param{}; + ccl_coll_param param{ false }; param.ctype = ccl_coll_allreduce; param.send_buf = ccl_buffer(coll_param.get_send_buf_ptr(), coll_param.get_send_count() * dtype_size, @@ -523,7 +530,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i case ccl_coll_allgatherv: { if (algo.allgatherv == ccl_coll_allgatherv_direct || algo.allgatherv == ccl_coll_allgatherv_naive) { - ccl_coll_entry_param param{}; + ccl_coll_param param{ false }; param.ctype = ccl_coll_allgatherv; param.send_buf = ccl_buffer(coll_param.get_send_buf_ptr(), coll_param.get_send_count() * dtype_size, @@ -531,7 +538,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i param.recv_buf = ccl_buffer( coll_param.get_recv_buf_ptr(), ag_recv_bytes, ccl_buffer_type::INDIRECT); param.send_count = coll_param.get_send_count(); - param.recv_counts = coll_param.recv_counts.data(); + param.recv_counts = coll_param.recv_counts; param.dtype = dtype; param.comm = comm; param.stream = coll_param.stream; @@ -590,7 +597,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i } #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE else { - ccl_coll_entry_param param{}; + ccl_coll_param param{ false }; param.ctype = coll_type; param.send_buf = 
ccl_buffer( coll_param.get_send_buf_ptr(), a2av_send_bytes, ccl_buffer_type::INDIRECT); @@ -605,8 +612,8 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i ccl::add_coll_entry(part_scheds[0].get(), param); } else { - param.send_counts = coll_param.send_counts.data(); - param.recv_counts = coll_param.recv_counts.data(); + param.send_counts = coll_param.send_counts; + param.recv_counts = coll_param.recv_counts; ccl::add_coll_entry(part_scheds[0].get(), param); } } @@ -615,7 +622,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i case ccl_coll_recv: for (idx = 0; idx < part_count; idx++) { - ccl_coll_entry_param param{}; + ccl_coll_param param{}; param.ctype = ccl_coll_recv; param.recv_buf = ccl_buffer(coll_param.get_recv_buf_ptr(), coll_param.get_recv_count() * dtype_size, @@ -626,13 +633,14 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i param.peer_rank = coll_param.peer_rank; param.comm = comm; param.stream = coll_param.stream; + param.is_pt2pt = true; ccl::add_coll_entry(part_scheds[idx].get(), param); } break; case ccl_coll_send: for (idx = 0; idx < part_count; idx++) { - ccl_coll_entry_param param{}; + ccl_coll_param param{}; param.ctype = ccl_coll_send; param.send_buf = ccl_buffer(coll_param.get_send_buf_ptr(), coll_param.get_send_count() * dtype_size, @@ -643,6 +651,7 @@ ccl::status ccl_parallelizer::process_base(ccl_sched* sched, bool update_sched_i param.peer_rank = coll_param.peer_rank; param.comm = comm; param.stream = coll_param.stream; + param.is_pt2pt = true; ccl::add_coll_entry(part_scheds[idx].get(), param); } break; diff --git a/src/sched/buffer/buffer_manager.cpp b/src/sched/buffer/buffer_manager.cpp index 44d96e37c..8898c61b2 100644 --- a/src/sched/buffer/buffer_manager.cpp +++ b/src/sched/buffer/buffer_manager.cpp @@ -18,7 +18,7 @@ #include "sched/buffer/buffer_manager.hpp" #ifdef CCL_ENABLE_ZE -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #endif // CCL_ENABLE_ZE namespace ccl { diff --git a/src/sched/cache/cache.hpp b/src/sched/cache/cache.hpp index 22b986f29..5624012f6 100644 --- a/src/sched/cache/cache.hpp +++ b/src/sched/cache/cache.hpp @@ -18,6 +18,7 @@ #include "common/utils/spinlock.hpp" #include "sched/cache/key.hpp" #include "sched/sched.hpp" +#include "sched/sched_timer.hpp" #include #include @@ -44,7 +45,7 @@ class ccl_sched_cache { ccl_sched_cache(const ccl_sched_cache& other) = delete; ccl_sched_cache& operator=(const ccl_sched_cache& other) = delete; template - std::pair find_or_create(ccl_sched_key&& key, Lambda create_fn); + std::pair find_or_create(ccl_sched_key&& key, const Lambda& create_fn); void recache(const ccl_sched_key& old_key, ccl_sched_key&& new_key); void release(ccl_sched* sched); bool try_flush(); @@ -60,16 +61,30 @@ class ccl_sched_cache { }; template -std::pair ccl_sched_cache::find_or_create(ccl_sched_key&& key, Lambda create_fn) { +/// create_fn lmbda is NOT copied internally or used after function exit +std::pair ccl_sched_cache::find_or_create(ccl_sched_key&& key, + const Lambda& create_fn) { ccl_sched* sched = nullptr; + bool is_created = false; { std::lock_guard lock{ guard }; sched = find_unsafe(key); if (sched) { +#ifdef CCL_ENABLE_ITT + __itt_event sched_cached_event = ccl::profile::itt::event_get("SCHED_CACHED"); + ccl::profile::itt::event_start(sched_cached_event); +#endif // CCL_ENABLE_ITT reference_counter++; +#ifdef CCL_ENABLE_ITT + ccl::profile::itt::event_end(sched_cached_event); 
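// Intended call pattern for find_or_create, per the new doc comment (the key
// construction and the sched builder are hypothetical placeholders, and the
// stripped return type is assumed to be std::pair<ccl_sched*, bool>):
//     auto [sched, created] = cache->find_or_create(std::move(key), [&]() {
//         return build_new_sched(); // runs only on a miss, under the cache
//     });                           // lock, and is never stored or copied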
+#endif // CCL_ENABLE_ITT } else { +#ifdef CCL_ENABLE_ITT + __itt_event sched_new_event = ccl::profile::itt::event_get("SCHED_NEW"); + ccl::profile::itt::event_start(sched_new_event); +#endif // CCL_ENABLE_ITT LOG_DEBUG("didn't find sched in cache, the new one will be created"); sched = create_fn(); { @@ -87,6 +102,9 @@ std::pair ccl_sched_cache::find_or_create(ccl_sched_key&& key, table.load_factor(), ", max_load_factor ", table.max_load_factor()); +#ifdef CCL_ENABLE_ITT + ccl::profile::itt::event_end(sched_new_event); +#endif // CCL_ENABLE_ITT } } LOG_TRACE("reference_counter=", reference_counter); diff --git a/src/sched/cache/key.cpp b/src/sched/cache/key.cpp index 462369412..e76b0a4a3 100644 --- a/src/sched/cache/key.cpp +++ b/src/sched/cache/key.cpp @@ -181,8 +181,8 @@ size_t ccl_sched_key_hasher::operator()(const ccl_sched_key& k) const { if (ccl::global_data::env().cache_key_type == ccl_cache_key_full) { /* TODO: improve hashing for vec fields to reduce probability of collisions e.g. sum(a[idx]*(idx+1)) */ - size_t vec1_sum = std::accumulate(k.vec1.begin(), k.vec1.end(), 0); - size_t vec2_sum = std::accumulate(k.vec2.begin(), k.vec2.end(), 0); + size_t vec1_sum = std::accumulate(k.vec1.begin(), k.vec1.end(), size_t(0)); + size_t vec2_sum = std::accumulate(k.vec2.begin(), k.vec2.end(), size_t(0)); hash_value += k.f.ctype + ccl::utils::enum_to_underlying(k.f.dtype) + ccl::utils::enum_to_underlying(k.f.reduction) + k.f.count1 + k.f.count2 + k.f.root + (size_t)k.f.buf1 + (size_t)k.f.buf2 + (size_t)k.f.comm + diff --git a/src/sched/entry/coll/coll_entry.cpp b/src/sched/entry/coll/coll_entry.cpp index 3ad6ffaaf..6de4c8973 100644 --- a/src/sched/entry/coll/coll_entry.cpp +++ b/src/sched/entry/coll/coll_entry.cpp @@ -14,19 +14,25 @@ limitations under the License. 
*/ #include "sched/entry/coll/coll_entry.hpp" +#include "sched/sched_timer.hpp" -ccl::status coll_entry::build_sched(ccl_sched* sched, const ccl_coll_entry_param& param) { +ccl::status coll_entry::build_sched(ccl_sched* sched, const ccl_coll_param& param) { ccl::status res = ccl::status::success; sched->hint_algo = param.hint_algo; +#ifdef CCL_ENABLE_ITT + __itt_event build_sched_event = ccl::profile::itt::event_get("BUILD_SCHED"); + ccl::profile::itt::event_start(build_sched_event); +#endif // CCL_ENABLE_ITT + switch (param.ctype) { case ccl_coll_allgatherv: { res = ccl_coll_build_allgatherv(sched, param.send_buf, param.send_count, param.recv_buf, - param.recv_counts, + param.recv_counts.data(), param.dtype, param.comm, param.is_scaleout); @@ -56,9 +62,9 @@ ccl::status coll_entry::build_sched(ccl_sched* sched, const ccl_coll_entry_param case ccl_coll_alltoallv: { res = ccl_coll_build_alltoallv(sched, param.send_buf, - param.send_counts, + param.send_counts.data(), param.recv_buf, - param.recv_counts, + param.recv_counts.data(), param.dtype, param.comm, param.is_scaleout); @@ -92,7 +98,8 @@ ccl::status coll_entry::build_sched(ccl_sched* sched, const ccl_coll_entry_param param.count, param.dtype, param.reduction, - param.comm); + param.comm, + param.is_scaleout); break; } case ccl_coll_recv: { @@ -107,6 +114,11 @@ ccl::status coll_entry::build_sched(ccl_sched* sched, const ccl_coll_entry_param } default: CCL_FATAL("not supported coll_type ", param.ctype); break; } + +#ifdef CCL_ENABLE_ITT + ccl::profile::itt::event_end(build_sched_event); +#endif // CCL_ENABLE_ITT + return res; } @@ -121,6 +133,7 @@ void coll_entry::start() { coll_param.ctype = sched->coll_param.ctype; coll_param.comm = sched->coll_param.comm; coll_param.stream = sched->coll_param.stream; + coll_param.is_pt2pt = sched->coll_param.is_pt2pt; LOG_DEBUG("building COLL entry: ", this, diff --git a/src/sched/entry/coll/coll_entry.hpp b/src/sched/entry/coll/coll_entry.hpp index f771db5db..0c82da29c 100644 --- a/src/sched/entry/coll/coll_entry.hpp +++ b/src/sched/entry/coll/coll_entry.hpp @@ -17,7 +17,6 @@ #include "common/global/global.hpp" #include "comp/comp.hpp" -#include "sched/entry/coll/coll_entry_param.hpp" #include "sched/entry/subsched_entry.hpp" class coll_entry : public subsched_entry, @@ -33,7 +32,7 @@ class coll_entry : public subsched_entry, } coll_entry() = delete; - coll_entry(ccl_sched* sched, const ccl_coll_entry_param& param, ccl_op_id_t op_id = 0) + coll_entry(ccl_sched* sched, const ccl_coll_param& param, ccl_op_id_t op_id = 0) : subsched_entry( sched, op_id, @@ -79,7 +78,7 @@ class coll_entry : public subsched_entry, return param.send_count; } - static ccl::status build_sched(ccl_sched* sched, const ccl_coll_entry_param& param); + static ccl::status build_sched(ccl_sched* sched, const ccl_coll_param& param); protected: void dump_detail(std::stringstream& str) const override { @@ -102,5 +101,5 @@ class coll_entry : public subsched_entry, } private: - ccl_coll_entry_param param; + ccl_coll_param param; }; diff --git a/src/sched/entry/coll/direct/alltoall_entry.hpp b/src/sched/entry/coll/direct/alltoall_entry.hpp index 32bb74505..fd09a3dee 100644 --- a/src/sched/entry/coll/direct/alltoall_entry.hpp +++ b/src/sched/entry/coll/direct/alltoall_entry.hpp @@ -89,11 +89,11 @@ class alltoall_entry : public base_coll_entry { } private: - ccl_buffer send_buf; - ccl_buffer recv_buf; - size_t cnt; - int bytes; - ccl_datatype dtype; - ccl_comm* comm; + ccl_buffer send_buf{}; + ccl_buffer recv_buf{}; + size_t cnt{ 
ccl::utils::initial_count_value }; + int bytes{ ccl::utils::invalid_bytes_value }; + ccl_datatype dtype{}; + ccl_comm* comm{}; atl_req_t req{}; }; diff --git a/src/sched/entry/coll/direct/barrier_entry.hpp b/src/sched/entry/coll/direct/barrier_entry.hpp index 6d85feef4..b001b9826 100644 --- a/src/sched/entry/coll/direct/barrier_entry.hpp +++ b/src/sched/entry/coll/direct/barrier_entry.hpp @@ -35,8 +35,9 @@ class barrier_entry : public base_coll_entry { if (unlikely(atl_status != ATL_STATUS_SUCCESS)) { CCL_THROW("BARRIER entry failed. atl_status: ", atl_status_to_str(atl_status)); } - else + else { status = ccl_sched_entry_status_started; + } } void update() override { @@ -46,8 +47,9 @@ class barrier_entry : public base_coll_entry { CCL_THROW("BARRIER entry failed. atl_status: ", atl_status_to_str(atl_status)); } - if (req.is_completed) + if (req.is_completed) { status = ccl_sched_entry_status_complete; + } } const char* name() const override { diff --git a/src/sched/entry/copy/copy_helper.cpp b/src/sched/entry/copy/copy_helper.cpp index cfe1814c2..5704879e2 100644 --- a/src/sched/entry/copy/copy_helper.cpp +++ b/src/sched/entry/copy/copy_helper.cpp @@ -20,6 +20,7 @@ copy_attr::copy_attr() : peer_rank(ccl_comm::invalid_rank), peer_buf_idx(0), direction(copy_direction::undefined), + pt2pt_op(false), map_comm(nullptr), in_buf_offset(0), out_buf_offset(0), @@ -34,15 +35,30 @@ copy_attr::copy_attr() copy_attr::copy_attr(int peer_rank, size_t peer_buf_idx, copy_direction direction, + bool pt2pt_op, ccl_comm* map_comm, size_t in_buf_offset, - size_t out_buf_offset) + size_t out_buf_offset, + bool use_nontemporal +#ifdef CCL_ENABLE_ZE + , + int hint_queue_index +#endif // CCL_ENABLE_ZE + ) : peer_rank(peer_rank), peer_buf_idx(peer_buf_idx), direction(direction), + pt2pt_op(pt2pt_op), map_comm(map_comm), in_buf_offset(in_buf_offset), - out_buf_offset(out_buf_offset) {} + out_buf_offset(out_buf_offset), + use_nontemporal(use_nontemporal) +#ifdef CCL_ENABLE_ZE + , + hint_queue_index(hint_queue_index) +#endif // CCL_ENABLE_ZE +{ +} copy_attr::copy_attr(copy_direction direction, size_t in_buf_offset, size_t out_buf_offset) : direction(direction), diff --git a/src/sched/entry/copy/copy_helper.hpp b/src/sched/entry/copy/copy_helper.hpp index 4363d5c23..8179d03c5 100644 --- a/src/sched/entry/copy/copy_helper.hpp +++ b/src/sched/entry/copy/copy_helper.hpp @@ -30,25 +30,33 @@ std::string to_string(copy_direction val); class ccl_comm; struct copy_attr { - int peer_rank; - size_t peer_buf_idx; + int peer_rank = ccl::utils::invalid_peer_rank; + size_t peer_buf_idx = 0; copy_direction direction; - ccl_comm* map_comm; - size_t in_buf_offset; - size_t out_buf_offset; - bool use_nontemporal; + bool pt2pt_op = false; + ccl_comm* map_comm = nullptr; + size_t in_buf_offset = 0; + size_t out_buf_offset = 0; + bool use_nontemporal = false; #ifdef CCL_ENABLE_ZE - int hint_queue_index; + int hint_queue_index = 0; #endif // CCL_ENABLE_ZE copy_attr(); copy_attr(int peer_rank, size_t peer_buf_idx, copy_direction direction, + bool pt2pt_op = false, ccl_comm* map_comm = nullptr, size_t in_buf_offset = 0, - size_t out_buf_offset = 0); + size_t out_buf_offset = 0, + bool use_nontemporal = false +#ifdef CCL_ENABLE_ZE + , + int hint_queue_index = 0 +#endif // CCL_ENABLE_ZE + ); copy_attr(copy_direction direction, size_t in_buf_offset = 0, size_t out_buf_offset = 0); }; @@ -146,7 +154,7 @@ struct sycl_copier { h, count, in_buf_offset); auto dst_buf_acc = dst_buf.template get_access( h, count, out_buf_offset); - h.copy(src_buf_acc, 
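// With the defaulted members above, a pt2pt copy attribute needs only the
// leading arguments; a sketch with illustrative values (the d2d enumerator
// is assumed here, only `undefined` is visible in this hunk):
//     copy_attr attr(peer_rank, /*peer_buf_idx*/ 0, copy_direction::d2d,
//                    /*pt2pt_op*/ true);
//     // offsets, the nontemporal flag and the ZE queue hint keep defaults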
dst_buf_acc); + h.copy(std::move(src_buf_acc), std::move(dst_buf_acc)); }); } else { @@ -174,14 +182,14 @@ struct sycl_copier { std::string get_dtype_name(const ccl_datatype& dt) const; copy_direction direction; - ccl_buffer in_buf; - ccl_buffer out_buf; - size_t count; - ccl_datatype dtype; - bool is_sycl_buf; - sycl::queue* q; - size_t in_buf_offset; - size_t out_buf_offset; - sycl::event e; + ccl_buffer in_buf{}; + ccl_buffer out_buf{}; + size_t count = 0; + ccl_datatype dtype{}; + bool is_sycl_buf = false; + sycl::queue* q{ nullptr }; + size_t in_buf_offset = 0; + size_t out_buf_offset = 0; + sycl::event e{}; }; #endif // CCL_ENABLE_SYCL diff --git a/src/sched/entry/deps_entry.hpp b/src/sched/entry/deps_entry.hpp index df6f840ac..92db5d9b7 100644 --- a/src/sched/entry/deps_entry.hpp +++ b/src/sched/entry/deps_entry.hpp @@ -4,6 +4,10 @@ #include "common/utils/sycl_utils.hpp" #endif // CCL_ENABLE_SYCL +#ifdef CCL_ENABLE_ITT +#include "sched/sched_timer.hpp" +#endif // CCL_ENABLE_ITT + #include "sched/entry/entry.hpp" class deps_entry : public sched_entry { @@ -33,22 +37,6 @@ class deps_entry : public sched_entry { if (all_completed) { status = ccl_sched_entry_status_complete; - -#ifdef CCL_ENABLE_ITT - // deps entry should be executed right at the beginning, so we can assume it's - // a start of operation execution - // due to issue with overlapping tasks we can't measure deps entry - // because it can start executing while the previous master sched - // is not completed yet. For now, start to measure operation when deps entry - // is completed and when there are no possible overlap. - ccl::profile::itt::task_start(ccl::profile::itt::task_type::operation); -#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) - // only applicable for device execution - if (sched->coll_param.stream) { - ccl::profile::itt::task_start(ccl::profile::itt::task_type::preparation); - } -#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE -#endif // CCL_ENABLE_ITT } } diff --git a/src/sched/entry/entry.cpp b/src/sched/entry/entry.cpp index c66325eec..fe88e89a1 100644 --- a/src/sched/entry/entry.cpp +++ b/src/sched/entry/entry.cpp @@ -62,6 +62,11 @@ void sched_entry::do_progress() { return; } +#ifdef CCL_ENABLE_ITT + this->itt_event = ccl::profile::itt::event_get(this->name()); + ccl::profile::itt::event_start(this->itt_event); +#endif // CCL_ENABLE_ITT + start(); CCL_THROW_IF_NOT(status >= ccl_sched_entry_status_again, "bad status ", @@ -106,6 +111,10 @@ void sched_entry::do_progress() { } if (status == ccl_sched_entry_status_complete) { +#ifdef CCL_ENABLE_ITT + ccl::profile::itt::event_end(this->itt_event); +#endif // CCL_ENABLE_ITT + if (use_total_timer) { total_timer.update(); } @@ -218,18 +227,25 @@ const ze_commands_t& sched_entry::get_ze_commands() const { return ze_commands; } -void sched_entry::ze_commands_submit() { +uint32_t sched_entry::ze_commands_submit() { + uint32_t cmd_counter = 0; LOG_DEBUG("entry ", name(), " ze_commands.size() ", ze_commands.size()); for (auto& command : ze_commands) { LOG_DEBUG("adding command ", command->name(), " to command list"); command->ze_call(); + ++cmd_counter; } LOG_DEBUG("entry ", name(), " all commands submitted"); // TODO: determine the effect of destroying the commands on the cache (e.g. 
kernel cache) ze_commands.clear(); + return cmd_counter; } #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL +ccl_sched* sched_entry::get_sched() const { + return sched; +} + void sched_entry::set_exec_mode(ccl_sched_entry_exec_mode mode) { exec_mode = mode; } diff --git a/src/sched/entry/entry.hpp b/src/sched/entry/entry.hpp index e5655d4dd..aca67e882 100644 --- a/src/sched/entry/entry.hpp +++ b/src/sched/entry/entry.hpp @@ -102,9 +102,11 @@ class alignas(CACHELINE_SIZE) sched_entry { #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) const ze_commands_t& get_ze_commands() const; - virtual void ze_commands_submit(); + virtual uint32_t ze_commands_submit(); #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL + ccl_sched* get_sched() const; + protected: virtual void start() = 0; virtual void update(); @@ -127,6 +129,10 @@ class alignas(CACHELINE_SIZE) sched_entry { bool use_update_timer = false; bool is_update_time_expired = false; +#ifdef CCL_ENABLE_ITT + __itt_event itt_event = ccl::profile::itt::invalid_event; +#endif // CCL_ENABLE_ITT + #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) ze_commands_t ze_commands; #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL diff --git a/src/sched/entry/recv_entry.hpp b/src/sched/entry/recv_entry.hpp index 2a6e7b5c5..11c5b25b8 100644 --- a/src/sched/entry/recv_entry.hpp +++ b/src/sched/entry/recv_entry.hpp @@ -55,7 +55,7 @@ class recv_entry : public sched_entry, uint16_t sched_id = sched->sched_id; if (sched->coll_param.ctype == ccl_coll_recv) { - sched_id = ccl_comm::pt2pt_sched_id; + sched_id = comm->get_atl_comm()->tag_creator->get_pt2pt_sched_id(); } atl_tag = comm->get_atl_comm()->tag_creator->create( diff --git a/src/sched/entry/send_entry.hpp b/src/sched/entry/send_entry.hpp index 538436472..19aae85da 100644 --- a/src/sched/entry/send_entry.hpp +++ b/src/sched/entry/send_entry.hpp @@ -73,7 +73,7 @@ class send_entry : public sched_entry, void start_send() { uint16_t sched_id = sched->sched_id; if (sched->coll_param.ctype == ccl_coll_send) { - sched_id = ccl_comm::pt2pt_sched_id; + sched_id = comm->get_atl_comm()->tag_creator->get_pt2pt_sched_id(); } atl_tag = comm->get_atl_comm()->tag_creator->create( diff --git a/src/sched/entry/subsched_entry.hpp b/src/sched/entry/subsched_entry.hpp index 88b7f1b10..f6976ed27 100644 --- a/src/sched/entry/subsched_entry.hpp +++ b/src/sched/entry/subsched_entry.hpp @@ -46,7 +46,7 @@ class subsched_entry : public sched_entry { subsched_entry(ccl_sched* sched, ccl_op_id_t op_id, - ccl_sched_create_param sched_param, + const ccl_sched_create_param& sched_param, const char* subsched_name) : sched_entry(sched), coll_param(sched->coll_param), @@ -77,6 +77,8 @@ class subsched_entry : public sched_entry { } void set_params() { + subsched->subsched_entry_parent_sched = sched; + if (!is_master_sched) { subsched->set_op_id(op_id); @@ -98,14 +100,16 @@ class subsched_entry : public sched_entry { #if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) // Submits all ze commands that have been stored both in the entry or on the subsched - void ze_commands_submit() override { + uint32_t ze_commands_submit() override { LOG_DEBUG("entry ", name(), " calling parent ze_commands_submit"); - sched_entry::ze_commands_submit(); + uint32_t cmd_counter = sched_entry::ze_commands_submit(); if (subsched) { LOG_DEBUG("entry ", name(), " calling subsched ze_commands_submit"); - subsched->ze_commands_submit(); + cmd_counter += subsched->ze_commands_submit(); } + + return cmd_counter; } #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL @@ -124,11 +128,11 @@ class 
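// ze_commands_submit() now returns how many stored ZE commands were issued,
// and subsched_entry (below) folds its subsched's count into its own, so a
// caller could sanity-check that a non-empty pipeline really submitted work:
//     uint32_t submitted = entry->ze_commands_submit(); // entry + subsched total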
subsched_entry : public sched_entry { if (is_master_sched) { ccl::global_data& data = ccl::global_data::get(); bool update_sched_id = false; + CCL_THROW_IF_NOT(subsched, "master sched is null"); subsched->start(data.executor.get(), true, update_sched_id); } else { build_subsched({ build_sched_id, sched->coll_param }); - subsched->subsched_entry_parent_sched = sched; subsched->renew(); subsched->get_request()->set_counter(1); subsched->bin = sched->bin; diff --git a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp index 1e5875f37..124770703 100644 --- a/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp +++ b/src/sched/entry/ze/allreduce/ze_a2a_allreduce_entry.cpp @@ -18,7 +18,7 @@ #include "sched/entry/ze/allreduce/ze_a2a_allreduce_entry.hpp" #include "sched/entry/ze/ze_a2a_allgatherv_entry.hpp" #include "sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include "sched/queue/queue.hpp" @@ -218,13 +218,11 @@ void ze_a2a_allreduce_entry::init_ze_hook() { peer_buf_offset, post_copy_events, kernel_events, + ze_base_entry::entry_event, is_monolithic_allgat, false, // is_inplace false); // is_separate_block_handles ze_a2a_allgatherv_op::select(init_params, kernels); - - // wait for post_copy_events and signal entry_event - ZE_APPEND_CALL(ze_cmd_barrier, get_copy_list(), ze_base_entry::entry_event, post_copy_events); } void ze_a2a_allreduce_entry::start() { diff --git a/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp index d191fb6da..63176c3fa 100644 --- a/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp +++ b/src/sched/entry/ze/allreduce/ze_onesided_allreduce_entry.cpp @@ -17,7 +17,7 @@ #include "comp/comp.hpp" #include "sched/entry/ze/allreduce/ze_onesided_allreduce_entry.hpp" #include "sched/entry/ze/ze_primitives.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/queue/queue.hpp" #include @@ -53,14 +53,8 @@ void ze_onesided_allreduce_entry::init_ze_hook() { send_buf_ptr = send_buf.get_ptr(); recv_buf_ptr = recv_buf.get_ptr(); - if (send_buf_ptr == recv_buf_ptr) { - sched->get_memory().handle_manager.get(peer_rank, 1, right_send_buf, comm); - sched->get_memory().handle_manager.get(peer_rank, 1, right_recv_buf, comm); - } - else { - sched->get_memory().handle_manager.get(peer_rank, 0, right_send_buf, comm); - sched->get_memory().handle_manager.get(peer_rank, 1, right_recv_buf, comm); - } + sched->get_memory().handle_manager.get(peer_rank, 0, right_send_buf, comm); + sched->get_memory().handle_manager.get(peer_rank, 1, right_recv_buf, comm); right_send_buf_ptr = static_cast(right_send_buf.get_ptr()) + buf_offset_bytes; right_recv_buf_ptr = static_cast(right_recv_buf.get_ptr()) + buf_offset_bytes; diff --git a/src/sched/entry/ze/ze_cache.cpp b/src/sched/entry/ze/cache/ze_cache.cpp similarity index 87% rename from src/sched/entry/ze/ze_cache.cpp rename to src/sched/entry/ze/cache/ze_cache.cpp index f954b2544..4132e0149 100644 --- a/src/sched/entry/ze/ze_cache.cpp +++ b/src/sched/entry/ze/cache/ze_cache.cpp @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "common/global/global.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include #include @@ -254,69 +254,6 @@ void event_pool_cache::push(ze_context_handle_t context, } } -// device_mem_cache -device_mem_cache::~device_mem_cache() { - if (!cache.empty()) { - LOG_WARN("device memory cache is not empty, size: ", cache.size()); - clear(); - } -} - -void device_mem_cache::clear() { - LOG_DEBUG("clear device memory cache: size: ", cache.size()); - std::lock_guard lock(mutex); - //for (auto& key_value : cache) { - // TODO: there is a segfault on this call, when ~cache is invoked w/ or w/0 api cache. - // But it passes, when CCL_ZE_CACHE=0 (calls of zeMemAllocDevice and ZeMemFree happen on every iteration). - // We don't control destroying phase and may be key_value.second (mem_ptr) is already away to free? - // ZE_CALL(zeMemFree, (std::get<0>(key_value.first), key_value.second)) - //} - cache.clear(); -} - -void device_mem_cache::get(ze_context_handle_t context, - ze_device_handle_t device, - const ze_device_mem_alloc_desc_t& device_mem_alloc_desc, - size_t bytes, - size_t alignment, - void** pptr) { - CCL_THROW_IF_NOT(context); - CCL_THROW_IF_NOT(device); - CCL_THROW_IF_NOT(pptr); - std::lock_guard lock(mutex); - if (!get_from_cache(cache, - *pptr, - context, - device, - bytes, - device_mem_alloc_desc.flags, - device_mem_alloc_desc.ordinal)) { - ZE_CALL(zeMemAllocDevice, - (context, &device_mem_alloc_desc, bytes, alignment, device, pptr)); - } -} - -void device_mem_cache::push(ze_context_handle_t context, - ze_device_handle_t device, - const ze_device_mem_alloc_desc_t& device_mem_alloc_desc, - size_t bytes, - size_t alignment, - void* ptr) { - CCL_THROW_IF_NOT(context); - CCL_THROW_IF_NOT(device); - CCL_THROW_IF_NOT(ptr); - std::lock_guard lock(mutex); - if (!push_to_cache(cache, - ptr, - context, - device, - bytes, - device_mem_alloc_desc.flags, - device_mem_alloc_desc.ordinal)) { - ZE_CALL(zeMemFree, (context, ptr)); - } -} - // module_cache module_cache::~module_cache() { if (!cache.empty()) { @@ -563,13 +500,32 @@ void ipc_handle_cache::push(ze_context_handle_t context, } // cache +cache::cache(size_t instance_count) + : instance_count(instance_count), + kernels(instance_count), + lists(instance_count), + queues(instance_count), + event_pools(instance_count) { + if (global_data::env().ze_device_cache_policy == device_cache_policy_mode::chunk) { + for (size_t i = 0; i < instance_count; ++i) { + device_mems.push_back(std::make_unique()); + } + } + else if (global_data::env().ze_device_cache_policy == device_cache_policy_mode::plain) { + for (size_t i = 0; i < instance_count; ++i) { + device_mems.push_back(std::make_unique()); + } + } + LOG_DEBUG("create cache with ", instance_count, " instances"); +} + cache::~cache() { for (size_t i = 0; i < instance_count; ++i) { kernels[i].clear(); lists[i].clear(); queues[i].clear(); event_pools[i].clear(); - device_mems[i].clear(); + device_mems[i]->clear(); } modules.clear(); diff --git a/src/sched/entry/ze/ze_cache.hpp b/src/sched/entry/ze/cache/ze_cache.hpp similarity index 86% rename from src/sched/entry/ze/ze_cache.hpp rename to src/sched/entry/ze/cache/ze_cache.hpp index a76f73e1f..9ff4103c6 100644 --- a/src/sched/entry/ze/ze_cache.hpp +++ b/src/sched/entry/ze/cache/ze_cache.hpp @@ -17,7 +17,9 @@ #include "common/log/log.hpp" #include "common/utils/hash.hpp" + #include "sched/entry/ze/ze_primitives.hpp" +#include "sched/entry/ze/cache/ze_device_cache.hpp" #include @@ -117,38 +119,6 @@ class 
event_pool_cache { std::mutex mutex; }; -class device_mem_cache { -public: - device_mem_cache() = default; - ~device_mem_cache(); - - void clear(); - - void get(ze_context_handle_t context, - ze_device_handle_t device, - const ze_device_mem_alloc_desc_t& device_mem_alloc_desc, - size_t bytes, - size_t alignment, - void** pptr); - - void push(ze_context_handle_t context, - ze_device_handle_t device, - const ze_device_mem_alloc_desc_t& device_mem_alloc_desc, - size_t bytes, - size_t alignment, - void* ptr); - -private: - using key_t = typename std::tuple; - using value_t = void*; - std::unordered_multimap cache; - std::mutex mutex; -}; - struct ipc_handle_desc; class module_cache { @@ -266,15 +236,7 @@ class ipc_handle_cache { class cache { public: - cache(size_t instance_count) - : instance_count(instance_count), - kernels(instance_count), - lists(instance_count), - queues(instance_count), - event_pools(instance_count), - device_mems(instance_count) { - LOG_DEBUG("create cache with ", instance_count, " instances"); - } + cache(size_t instance_count); cache(const cache&) = delete; cache& operator=(const cache&) = delete; ~cache(); @@ -316,10 +278,7 @@ class cache { const ze_device_mem_alloc_desc_t& device_mem_alloc_desc, size_t bytes, size_t alignment, - void** pptr) { - device_mems.at(instance_idx % device_mems.size()) - .get(context, device, device_mem_alloc_desc, bytes, alignment, pptr); - } + void** pptr); void get(ze_context_handle_t context, ze_device_handle_t device, @@ -379,10 +338,7 @@ class cache { const ze_device_mem_alloc_desc_t& device_mem_alloc_desc, size_t bytes, size_t alignment, - void* ptr) { - device_mems.at(instance_idx % device_mems.size()) - .push(context, device, device_mem_alloc_desc, bytes, alignment, ptr); - } + void* ptr); private: const size_t instance_count; @@ -390,7 +346,7 @@ class cache { std::vector lists; std::vector queues; std::vector event_pools; - std::vector device_mems; + std::vector> device_mems; module_cache modules{}; mem_handle_cache mem_handles{}; ipc_handle_cache ipc_handles{}; diff --git a/src/sched/entry/ze/cache/ze_device_cache.cpp b/src/sched/entry/ze/cache/ze_device_cache.cpp new file mode 100644 index 000000000..84fd57d8c --- /dev/null +++ b/src/sched/entry/ze/cache/ze_device_cache.cpp @@ -0,0 +1,339 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "common/global/global.hpp" +#include "sched/entry/ze/cache/ze_device_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" + +namespace ccl { +namespace ze { + +template +bool get_from_cache(map_t& cache, typename map_t::mapped_type& object, keys_t... 
keys) {
+    bool success{};
+
+    if (!global_data::env().enable_ze_cache)
+        return success;
+
+    typename map_t::key_type key(keys...);
+    auto key_value = cache.find(key);
+    if (key_value != cache.end()) {
+        object = key_value->second;
+        cache.erase(key_value);
+        LOG_DEBUG("loaded from cache: object: ", object);
+        success = true;
+    }
+    return success;
+}
+
+template <class map_t, class... keys_t>
+bool push_to_cache(map_t& cache, const typename map_t::mapped_type& object, keys_t... keys) {
+    bool success{};
+
+    if (!global_data::env().enable_ze_cache)
+        return success;
+
+    typename map_t::key_type key(keys...);
+    auto range = cache.equal_range(key);
+    auto range_len = std::distance(range.first, range.second);
+    if (range_len > 0) {
+        LOG_DEBUG("cache already contains ", range_len, " objects with the same key");
+        for (auto i = range.first; i != range.second; ++i) {
+            CCL_THROW_IF_NOT(i->second != object, "trying to push object that already exists");
+        }
+    }
+    cache.insert({ std::move(key), object });
+    LOG_DEBUG("inserted to cache: object: ", object);
+    success = true;
+    return success;
+}
+
+// plain_device_mem_cache
+plain_device_mem_cache::~plain_device_mem_cache() {
+    if (!cache.empty()) {
+        LOG_WARN("device memory cache is not empty, size: ", cache.size());
+        clear();
+    }
+}
+
+void plain_device_mem_cache::clear() {
+    LOG_DEBUG("clear plain device memory cache: size: ", cache.size());
+    std::lock_guard<std::mutex> lock(mutex);
+    //for (auto& key_value : cache) {
+    // TODO: there is a segfault on this call when ~cache is invoked w/ or w/o api cache.
+    // It passes when CCL_ZE_CACHE=0 (zeMemAllocDevice and zeMemFree are called on every iteration).
+    // We don't control the destruction order, so key_value.second (mem_ptr) may already have been freed.
+    // ZE_CALL(zeMemFree, (std::get<0>(key_value.first), key_value.second))
+    //}
+    cache.clear();
+}
+
+void plain_device_mem_cache::get(ze_context_handle_t context,
+                                 ze_device_handle_t device,
+                                 const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                                 size_t bytes,
+                                 size_t alignment,
+                                 void** pptr) {
+    CCL_THROW_IF_NOT(context);
+    CCL_THROW_IF_NOT(device);
+    CCL_THROW_IF_NOT(pptr);
+    std::lock_guard<std::mutex> lock(mutex);
+    if (!get_from_cache(cache,
+                        *pptr,
+                        context,
+                        device,
+                        bytes,
+                        device_mem_alloc_desc.flags,
+                        device_mem_alloc_desc.ordinal)) {
+        ZE_CALL(zeMemAllocDevice,
+                (context, &device_mem_alloc_desc, bytes, alignment, device, pptr));
+    }
+}
+
+void plain_device_mem_cache::push(ze_context_handle_t context,
+                                  ze_device_handle_t device,
+                                  const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                                  size_t bytes,
+                                  size_t alignment,
+                                  void* ptr) {
+    CCL_THROW_IF_NOT(context);
+    CCL_THROW_IF_NOT(device);
+    CCL_THROW_IF_NOT(ptr);
+    std::lock_guard<std::mutex> lock(mutex);
+    if (!push_to_cache(cache,
+                       ptr,
+                       context,
+                       device,
+                       bytes,
+                       device_mem_alloc_desc.flags,
+                       device_mem_alloc_desc.ordinal)) {
+        ZE_CALL(zeMemFree, (context, ptr));
+    }
+}
+
+// chunk implementation
+chunk_device_mem_cache::~chunk_device_mem_cache() {
+    if (!memory_chunks.empty()) {
+        LOG_WARN("device memory cache is not empty, size: ", memory_chunks.size());
+        clear();
+    }
+}
+
+void chunk_device_mem_cache::clear() {
+    LOG_DEBUG("clear chunk device memory cache: size: ", memory_chunks.size());
+    std::lock_guard<std::mutex> lock(mutex);
+
+    // TODO: there is a segfault on this call when ~cache is invoked w/ or w/o api cache.
+    // free all memory chunks and reset the vector.
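+    // Why the frees are deferred (an observation from debugging, not verified on
+    // every configuration): this clear() can run during global teardown, after the
+    // owning ze_context_handle_t has already been destroyed, so zeMemFree on
+    // chunk.base_ptr may touch released driver state. The commented-out loop below
+    // sketches the intended cleanup once destruction order is under control; note
+    // that it would need the owning context, which memory_chunk does not store today.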
+    // for (auto& chunk : memory_chunks) {
+    //     ZE_CALL(zeMemFree, (/* owning context */, chunk.base_ptr));
+    // }
+    memory_chunks.clear();
+}
+
+// get a memory block from the cache
+void chunk_device_mem_cache::get(ze_context_handle_t context,
+                                 ze_device_handle_t device,
+                                 const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                                 size_t bytes,
+                                 size_t alignment,
+                                 void** pptr) {
+    CCL_THROW_IF_NOT(context);
+    CCL_THROW_IF_NOT(device);
+    CCL_THROW_IF_NOT(pptr);
+
+    std::lock_guard<std::mutex> lock(mutex);
+
+    if (global_data::env().enable_ze_cache) {
+        // find a suitable memory chunk that has enough space.
+        size_t block_size = bytes;
+        for (auto& chunk : memory_chunks) {
+            if (chunk.block_size < block_size) {
+                LOG_DEBUG("skip chunks with different block size: chunk.block_size: ",
+                          chunk.block_size,
+                          ", block_size: ",
+                          block_size);
+                continue;
+            }
+
+            for (size_t block_idx = 0; block_idx < chunk.num_blocks; block_idx++) {
+                if (!chunk.used_blocks[block_idx]) {
+                    // found a free block in the chunk, use it.
+                    chunk.used_blocks[block_idx] = true;
+                    *pptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(chunk.base_ptr) +
+                                                    block_idx * chunk.block_size);
+                    LOG_DEBUG("loaded from cache: object: ", *pptr);
+                    return;
+                }
+            }
+        }
+
+        // if no suitable block is found, allocate a new chunk and use its first block.
+        allocate_new_chunk(device_mem_alloc_desc, context, device, bytes, alignment);
+        *pptr = memory_chunks.back().base_ptr;
+        LOG_DEBUG("allocated new chunk: object: ", *pptr);
+
+        // check memory usage and evict a chunk if the cache exceeds its upper limit
+        if (get_total_cache_size() > global_data::env().ze_device_cache_upper_limit) {
+            if (global_data::env().ze_device_cache_evict_smallest) {
+                evict_smallest_chunk(context);
+            }
+            else {
+                evict_largest_chunk(context);
+            }
+        }
+    }
+    else {
+        ZE_CALL(zeMemAllocDevice,
+                (context, &device_mem_alloc_desc, bytes, alignment, device, pptr));
+        LOG_DEBUG("allocated directly: object: ", *pptr);
+    }
+}
+
+// push a memory block back to the cache
+void chunk_device_mem_cache::push(ze_context_handle_t context,
+                                  ze_device_handle_t device,
+                                  const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                                  size_t bytes,
+                                  size_t alignment,
+                                  void* ptr) {
+    CCL_THROW_IF_NOT(context);
+    CCL_THROW_IF_NOT(device);
+    CCL_THROW_IF_NOT(ptr);
+
+    std::lock_guard<std::mutex> lock(mutex);
+
+    if (global_data::env().enable_ze_cache) {
+        // find the corresponding memory chunk and mark the block as free.
+        for (auto& chunk : memory_chunks) {
+            if (reinterpret_cast<uintptr_t>(ptr) - reinterpret_cast<uintptr_t>(chunk.base_ptr) <=
+                chunk.size) {
+                size_t offset =
+                    reinterpret_cast<uintptr_t>(ptr) - reinterpret_cast<uintptr_t>(chunk.base_ptr);
+                size_t block_index = offset / chunk.block_size;
+                chunk.used_blocks[block_index] = false;
+                LOG_DEBUG("pushed to cache: object: ", ptr);
+                // check memory usage and evict a chunk if the cache exceeds its upper limit.
+                if (get_total_cache_size() > global_data::env().ze_device_cache_upper_limit) {
+                    if (global_data::env().ze_device_cache_evict_smallest) {
+                        evict_smallest_chunk(context);
+                    }
+                    else {
+                        evict_largest_chunk(context);
+                    }
+                }
+                return;
+            }
+        }
+    }
+
+    // if the pointer does not belong to any existing chunk, free it directly.
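+    // Containment test above, worked through (assuming the uintptr_t arithmetic
+    // as reconstructed): with base_ptr = 0x1000, size = 0x400, block_size = 0x100,
+    //   ptr = 0x1250 -> difference 0x250 <= 0x400 -> block_index 2 is marked free above;
+    //   ptr = 0x0800 -> difference wraps to a huge unsigned value -> falls through here.
+    // Note that ptr == base_ptr + size also satisfies the '<=' test even though it
+    // is one past the chunk; keep that in mind if block layouts ever change.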
+    ZE_CALL(zeMemFree, (context, ptr));
+    LOG_DEBUG("freed directly: object: ", ptr);
+}
+
+int chunk_device_mem_cache::get_total_cache_size() const {
+    long total_size = 0;
+    for (const auto& chunk : memory_chunks) {
+        total_size += chunk.size;
+    }
+    return total_size;
+}
+
+template <typename ComparisonFunction>
+void chunk_device_mem_cache::evict_chunk(ze_context_handle_t context, ComparisonFunction compFunc) {
+    if (memory_chunks.empty()) {
+        return;
+    }
+
+    auto chunk_it =
+        std::max_element(memory_chunks.begin(),
+                         memory_chunks.end(),
+                         [this, &compFunc](const memory_chunk& a, const memory_chunk& b) {
+                             return compFunc(a, b) && !is_chunk_used(a);
+                         });
+
+    if (chunk_it != memory_chunks.end() && !is_chunk_used(*chunk_it)) {
+        ZE_CALL(zeMemFree, (context, chunk_it->base_ptr));
+        memory_chunks.erase(chunk_it);
+    }
+}
+
+void chunk_device_mem_cache::evict_smallest_chunk(ze_context_handle_t context) {
+    // with std::max_element, a 'greater-than' comparator drives selection
+    // toward the smallest chunk.
+    evict_chunk(context, [](const memory_chunk& a, const memory_chunk& b) {
+        return a.size > b.size;
+    });
+}
+
+void chunk_device_mem_cache::evict_largest_chunk(ze_context_handle_t context) {
+    evict_chunk(context, [](const memory_chunk& a, const memory_chunk& b) {
+        return a.size < b.size;
+    });
+}
+
+bool chunk_device_mem_cache::is_chunk_used(const memory_chunk& chunk) const {
+    return std::any_of(chunk.used_blocks.begin(), chunk.used_blocks.end(), [](bool used) {
+        return used;
+    });
+}
+
+// allocate a new memory chunk
+void chunk_device_mem_cache::allocate_new_chunk(
+    const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+    ze_context_handle_t context,
+    ze_device_handle_t device,
+    size_t bytes,
+    size_t alignment) {
+    // define the chunk size as a multiple of the block size to avoid fragmentation.
+    size_t block_size = bytes;
+    size_t num_blocks_per_chunk =
+        global_data::env().ze_device_cache_num_blocks_in_chunk; // taken from the environment; tune per workload.
+    size_t chunk_size = block_size * num_blocks_per_chunk;
+
+    // allocate the memory chunk and create the memory_chunk structure.
+    void* base_ptr;
+    ZE_CALL(zeMemAllocDevice,
+            (context, &device_mem_alloc_desc, chunk_size, alignment, device, &base_ptr));
+    memory_chunks.emplace_back(chunk_size, block_size);
+    memory_chunks.back().base_ptr = base_ptr;
+    memory_chunks.back().used_blocks[0] = true; // mark the first block as used
+}
+
+// cache
+void cache::get(size_t instance_idx,
+                ze_context_handle_t context,
+                ze_device_handle_t device,
+                const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                size_t bytes,
+                size_t alignment,
+                void** pptr) {
+    device_mems.at(instance_idx % device_mems.size())
+        ->get(context, device, device_mem_alloc_desc, bytes, alignment, pptr);
+}
+
+void cache::push(size_t instance_idx,
+                 ze_context_handle_t context,
+                 ze_device_handle_t device,
+                 const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                 size_t bytes,
+                 size_t alignment,
+                 void* ptr) {
+    device_mems.at(instance_idx % device_mems.size())
+        ->push(context, device, device_mem_alloc_desc, bytes, alignment, ptr);
+}
+
+} // namespace ze
+} // namespace ccl
diff --git a/src/sched/entry/ze/cache/ze_device_cache.hpp b/src/sched/entry/ze/cache/ze_device_cache.hpp
new file mode 100644
index 000000000..9a487e5fc
--- /dev/null
+++ b/src/sched/entry/ze/cache/ze_device_cache.hpp
@@ -0,0 +1,143 @@
+/*
+    Copyright 2016-2020 Intel Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+*/
+#pragma once
+
+#include "common/log/log.hpp"
+#include "common/utils/hash.hpp"
+#include "sched/entry/ze/ze_primitives.hpp"
+
+#include
+
+namespace ccl {
+namespace ze {
+
+enum class device_cache_policy_mode : int { plain, chunk, none };
+static std::map<device_cache_policy_mode, std::string> device_cache_policy_names = {
+    std::make_pair(device_cache_policy_mode::plain, "plain"),
+    std::make_pair(device_cache_policy_mode::chunk, "chunk"),
+    std::make_pair(device_cache_policy_mode::none, "none")
+};
+
+class device_mem_cache {
+public:
+    device_mem_cache() = default;
+    virtual ~device_mem_cache() = default;
+
+    virtual void clear() = 0;
+
+    virtual void get(ze_context_handle_t context,
+                     ze_device_handle_t device,
+                     const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                     size_t bytes,
+                     size_t alignment,
+                     void** pptr) = 0;
+
+    virtual void push(ze_context_handle_t context,
+                      ze_device_handle_t device,
+                      const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                      size_t bytes,
+                      size_t alignment,
+                      void* ptr) = 0;
+};
+
+class plain_device_mem_cache : public device_mem_cache {
+public:
+    plain_device_mem_cache() = default;
+    ~plain_device_mem_cache();
+
+    void clear();
+
+    void get(ze_context_handle_t context,
+             ze_device_handle_t device,
+             const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+             size_t bytes,
+             size_t alignment,
+             void** pptr);
+
+    void push(ze_context_handle_t context,
+              ze_device_handle_t device,
+              const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+              size_t bytes,
+              size_t alignment,
+              void* ptr);
+
+private:
+    using key_t = typename std::tuple<ze_context_handle_t,
+                                      ze_device_handle_t,
+                                      size_t,
+                                      ze_device_mem_alloc_flags_t,
+                                      uint32_t>;
+    using value_t = void*;
+    std::unordered_multimap<key_t, value_t, utils::tuple_hash> cache;
+    std::mutex mutex;
+};
+
+// chunk_device_mem_cache
+class chunk_device_mem_cache : public device_mem_cache {
+public:
+    chunk_device_mem_cache() = default;
+    ~chunk_device_mem_cache();
+    void clear();
+
+    void get(ze_context_handle_t context,
+             ze_device_handle_t device,
+             const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+             size_t bytes,
+             size_t alignment,
+             void** pptr);
+
+    void push(ze_context_handle_t context,
+              ze_device_handle_t device,
+              const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+              size_t bytes,
+              size_t alignment,
+              void* ptr);
+
+private:
+    struct memory_chunk {
+        size_t size;
+        size_t block_size;
+        size_t num_blocks;
+        void* base_ptr;
+        std::vector<bool> used_blocks;
+
+        memory_chunk(size_t chunk_size, size_t block_size)
+                : size(chunk_size),
+                  block_size(block_size),
+                  num_blocks(chunk_size / block_size),
+                  base_ptr(nullptr),
+                  used_blocks(chunk_size / block_size, false) {}
+    };
+
+    void allocate_new_chunk(const ze_device_mem_alloc_desc_t& device_mem_alloc_desc,
+                            ze_context_handle_t context,
+                            ze_device_handle_t device,
+                            size_t bytes,
+                            size_t alignment);
+
+    template <typename ComparisonFunction>
+    void evict_chunk(ze_context_handle_t context, ComparisonFunction compFunc);
+    void evict_smallest_chunk(ze_context_handle_t context);
+    void evict_largest_chunk(ze_context_handle_t context);
+
+    int get_total_cache_size() const;
+    bool is_chunk_used(const memory_chunk& chunk) const;
+    std::vector<memory_chunk> memory_chunks;
+    std::mutex mutex;
+};
+
+} // namespace ze
+} // namespace ccl
diff --git
a/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp b/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp index 7d3665541..e23e24e78 100644 --- a/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp +++ b/src/sched/entry/ze/ze_a2a_allgatherv_entry.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "sched/entry/ze/ze_a2a_allgatherv_entry.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include @@ -35,19 +35,23 @@ ze_a2a_allgatherv_entry::ze_a2a_allgatherv_entry(ccl_sched* sched, size_t peer_buf_offset, bool is_monolithic_pipeline, ccl_comm* pipeline_comm, - bool is_separate_block_handles) + bool is_separate_block_handles, + bool is_scaleout, + size_t scaleout_offset) : ze_base_entry(sched, wait_events, comm, comm->size() * event_group_count), send_buf(send_buf), send_count(send_count), - recv_bufs(recv_bufs), - recv_counts(recv_counts), + recv_bufs(std::move(recv_bufs)), + recv_counts(std::move(recv_counts)), dtype(dtype), peer_buf_idx(peer_buf_idx), peer_buf_offset(peer_buf_offset), peer_count(comm->size() - 1), is_monolithic_pipeline(is_monolithic_pipeline), pipeline_comm(pipeline_comm), - is_separate_block_handles(is_separate_block_handles) {} + is_separate_block_handles(is_separate_block_handles), + is_scaleout(is_scaleout), + scaleout_offset(scaleout_offset) {} void ze_a2a_allgatherv_entry::init_ze_hook() { /* get peer recv buffers */ @@ -56,16 +60,28 @@ void ze_a2a_allgatherv_entry::init_ze_hook() { for (int i = 0; i < peer_count; ++i) { const int peer_rank = (comm_rank + i + 1) % comm->size(); + int peer_global_rank = comm->get_global_rank(peer_rank); + if (is_scaleout) { + // recv_bufs.size() gives size of global communicator + peer_global_rank = (peer_global_rank + scaleout_offset) % recv_bufs.size(); + } + ccl_buffer buf{}; - sched->get_memory().handle_manager.get(peer_rank, peer_buf_idx, buf, comm); - CCL_THROW_IF_NOT(buf.get_ptr(), "null IPC buffer is received"); + if (!(is_monolithic_pipeline && recv_counts.at(peer_rank) == 0)) { + if (is_scaleout) { + sched->get_memory().handle_manager.get(peer_rank, 1 + peer_global_rank, buf, comm); + } + else { + sched->get_memory().handle_manager.get(peer_rank, peer_buf_idx, buf, comm); + } + CCL_THROW_IF_NOT(buf.get_ptr(), "null IPC buffer is received"); + } peer_recv_bufs[peer_rank] = buf; if (is_monolithic_pipeline && pipeline_comm != nullptr) { // get peer buffer handles with pair_comm peer when pair_comm size > 1 if (pipeline_comm->size() > 1) { ccl_buffer buf_pair{}; - const int peer_global_rank = comm->get_global_rank(peer_rank); // currently pipeline_comm is pair_comm and it can have a maximum of 2 ranks CCL_THROW_IF_NOT(pipeline_comm->size() == 2, "algorithm only supports pipeline_comm of size 2"); @@ -75,8 +91,11 @@ void ze_a2a_allgatherv_entry::init_ze_hook() { // when separate handles are not there, use the idx parameter for all peers. const size_t pair_peer_buf_idx = (is_separate_block_handles) ? 
1 + peer_global_rank : peer_buf_idx; - sched->get_memory().handle_manager.get( - pair_peer_rank, pair_peer_buf_idx, buf_pair, pipeline_comm); + // only get the buffer if it exists; otherwise, segfault occurs + if (recv_counts[pair_peer_rank] > 0) { + sched->get_memory().handle_manager.get( + pair_peer_rank, pair_peer_buf_idx, buf_pair, pipeline_comm); + } pair_peer_recv_bufs[peer_rank] = buf_pair; } else { @@ -86,7 +105,10 @@ void ze_a2a_allgatherv_entry::init_ze_hook() { } bool is_inplace{}; - if (send_buf == recv_bufs.at(comm_rank)) { + // is_separate_block_handles is used from allgatherv + // topo which performs copy incase data is not inplace + // and therefore we do not need a copy here + if (is_separate_block_handles || send_buf == recv_bufs.at(comm_rank)) { is_inplace = true; } std::vector rank_buf_offsets(comm_size); @@ -94,10 +116,12 @@ void ze_a2a_allgatherv_entry::init_ze_hook() { rank_buf_offsets[i] = rank_buf_offsets[i - 1] + recv_counts[i - 1]; } - CCL_THROW_IF_NOT(send_count == recv_counts[comm_rank], + CCL_THROW_IF_NOT(is_separate_block_handles || send_count == recv_counts[comm_rank], "allgatherv send_count :", send_count, - " and recv_count :", + " and recv_count of rank ", + comm_rank, + ":", recv_counts[comm_rank], " does not match"); @@ -114,6 +138,7 @@ void ze_a2a_allgatherv_entry::init_ze_hook() { if (dtype == ccl::datatype::int8) { is_monolithic = false; } + // when monolithic pipelined kernel is used, two events // are needed for the unaligned and aligned kernels. // otherwise we use an event for copying from each peer. @@ -123,7 +148,9 @@ void ze_a2a_allgatherv_entry::init_ze_hook() { copy_events_size = (int)ccl::utils::align_kernels::count; } // need additional memcpy for non inplace data - if (!is_inplace) { + if (!is_inplace && std::any_of(recv_counts.begin(), recv_counts.end(), [](auto& i) { + return i > 0; + })) { copy_events_size++; } copy_events.resize(copy_events_size); @@ -147,21 +174,17 @@ void ze_a2a_allgatherv_entry::init_ze_hook() { peer_buf_offset, copy_events, wait_events, + ze_base_entry::entry_event, is_monolithic, is_monolithic_pipeline, is_inplace, - is_separate_block_handles); + is_separate_block_handles, + is_scaleout, + scaleout_offset); ze_a2a_allgatherv_op::select(init_params, kernels); } void ze_a2a_allgatherv_entry::update() { - for (const auto& event : copy_events) { - if (!ze_base_entry::is_event_completed(event)) { - return; - } - } - - ZE_CALL(zeEventHostSignal, (ze_base_entry::entry_event)); ze_base_entry::update(); } @@ -192,10 +215,13 @@ ze_a2a_allgatherv_op::ze_a2a_allgatherv_op(ccl_sched* sched, size_t peer_buf_offset, std::vector& copy_events, std::vector& wait_events, + ze_event_handle_t out_event, bool is_monolithic, bool is_monolithic_pipeline, bool is_inplace, - bool is_separate_block_handles) + bool is_separate_block_handles, + bool is_scaleout, + size_t scaleout_offset) : sched(sched), entry(entry), comm(comm), @@ -212,51 +238,70 @@ ze_a2a_allgatherv_op::ze_a2a_allgatherv_op(ccl_sched* sched, peer_buf_offset(peer_buf_offset), copy_events(copy_events), wait_events(wait_events), + out_event(out_event), is_monolithic(is_monolithic), is_monolithic_pipeline(is_monolithic_pipeline), is_inplace(is_inplace), - is_separate_block_handles(is_separate_block_handles) {} + is_separate_block_handles(is_separate_block_handles), + is_scaleout(is_scaleout), + scaleout_offset(scaleout_offset) {} // main function to choose read/write operation for a2a_allgatherv void ze_a2a_allgatherv_op::select(ze_a2a_allgatherv_op& args, std::vector& 
kernels) { + size_t num_op_events = 0; if (args.is_monolithic_pipeline) { // read data using xelink and then write that data through mdfi // input events: wait_events - // output event: copy_events[0] + // output event: copy_events[0..1] ze_a2a_allgatherv_op::read_write(args, kernels); + num_op_events = 2; } else if (ccl::global_data::env().allgatherv_topo_read) { // input events: wait_events // output event(s): copy_events[0..peer_count-1] ze_a2a_allgatherv_op::read(args); + num_op_events = args.peer_count; } else { // input events: wait_events // output event(s): copy_events[0..peer_count-1] ze_a2a_allgatherv_op::write(args, kernels); + num_op_events = args.peer_count; } - if (!args.is_inplace) { - // args.wait_events must be updated to copy_events[0 .. copy_events.size()-1] - args.wait_events.clear(); - args.wait_events.reserve(args.copy_events.size() - 1); - std::copy( - args.copy_events.begin(), args.copy_events.end() - 1, back_inserter(args.wait_events)); + CCL_ASSERT(args.copy_events.size() >= num_op_events); + std::vector op_events(args.copy_events.begin(), + args.copy_events.begin() + num_op_events); + CCL_ASSERT(op_events.size() == num_op_events); + + auto list = args.entry->get_copy_list(copy_direction::t2t); + if (!args.is_inplace && args.copy_bytes.at(args.comm->rank()) > 0) { // copy send_buf to my buffer void* dst = args.recv_bufs.at(args.comm->rank()).get_ptr(); if (args.is_monolithic_pipeline) { + // TODO: how is this going to work in all cases? what if comm is !world_comm? Then my_global_rank will point to an incorrect rank. Which, at the very least, can get us referencing "recv_bufs[much_larger_than_size]". const int my_global_rank = args.comm->get_global_rank(args.comm->rank()); dst = args.recv_bufs.at(my_global_rank).get_ptr(); } ZE_APPEND_CALL_TO_ENTRY(args.entry, ze_cmd_memory_copy, - args.entry->get_copy_list(copy_direction::t2t), + list, dst, args.send_buf.get_ptr(), // src args.copy_bytes.at(args.comm->rank()), - args.copy_events.back(), - args.wait_events); + args.out_event, + op_events); + } + else { + // case: + // copy of zero buffer, no ze_cmd_memory_copy occured + // signalling copy_events[i] by hand is required for sync purposes + // ze_cmd_memory_copy cannot be called: + // src_buf might be null, which causes segfault + // copy_event cannot be skipped - it is referenced later + + ZE_APPEND_CALL_TO_ENTRY(args.entry, ze_cmd_barrier, list, args.out_event, op_events); } } @@ -293,6 +338,10 @@ void ze_a2a_allgatherv_op::read_write(ze_a2a_allgatherv_op& args, std::vectorget_global_rank(peer_rank); + if (a.is_scaleout) { + // recv_bufs.size() gives size of global communicator + base_index_local = (base_index_local + a.scaleout_offset) % a.recv_bufs.size(); + } } else { // when using non-separate handles we fill the data @@ -305,9 +354,12 @@ void ze_a2a_allgatherv_op::read_write(ze_a2a_allgatherv_op& args, std::vectorsize(); - ze_kernel_args_t kernel_args{ - &pipeline_comm_size, peer_even_bufs[i], local_bufs[i], peer_pair_bufs[i], counts[i] + ze_kernel_args_t kernel_args{ &pipeline_comm_size }; + auto try_push_arg = [&kernel_args](const std::vector& buffers) { + for (auto& buffer : buffers) { + if (buffer) { + kernel_args.push_back(&buffer); + } + else { + kernel_args.push_back({}); + } + } }; + try_push_arg(peer_even_bufs[i]); + try_push_arg(local_bufs[i]); + try_push_arg(peer_pair_bufs[i]); + kernel_args.push_back(counts[i]); + auto this_count = *std::max_element(counts[i].begin(), counts[i].end()); ze_kernel kernel(module, monolithic_kernel_name, kernel_args, 
this_count, worker_idx); @@ -371,14 +435,28 @@ void ze_a2a_allgatherv_op::read(ze_a2a_allgatherv_op& args) { .get_ptr(); } - ZE_APPEND_CALL_TO_ENTRY(args.entry, - ze_cmd_memory_copy, - a.entry->get_copy_list(copy_direction::c2c, i), - a.recv_bufs[peer_rank].get_ptr(), - src, - a.copy_bytes.at(peer_rank), - a.copy_events.at(i), - a.wait_events); + auto list = a.entry->get_copy_list(copy_direction::c2c, i); + + if (a.copy_bytes.at(peer_rank)) { + ZE_APPEND_CALL_TO_ENTRY(args.entry, + ze_cmd_memory_copy, + list, + a.recv_bufs[peer_rank].get_ptr(), + src, + a.copy_bytes.at(peer_rank), + a.copy_events.at(i), + a.wait_events); + } + else { + // case: + // copy of zero buffer, no ze_cmd_memory_copy occured + // signalling copy_events[i] by hand is required for sync purposes + // ze_cmd_memory_copy cannot be called: + // src_buf might be null, which causes segfault + // copy_event cannot be skipped - it is referenced later + ZE_APPEND_CALL_TO_ENTRY( + args.entry, ze_cmd_barrier, list, a.copy_events.at(i), a.wait_events); + } } } @@ -408,15 +486,33 @@ void ze_a2a_allgatherv_op::write(ze_a2a_allgatherv_op& args, std::vectorget_copy_list(copy_direction::c2c, (i + 1) * 2); - ZE_APPEND_CALL_TO_ENTRY(args.entry, - ze_cmd_memory_copy, - list, - dst_buf.get_ptr(), - src_buf.get_ptr(), - a.copy_bytes.at(a.comm->rank()), - a.copy_events.at(i), - a.wait_events); + auto copy_engine_idx = (i + 1) * 2; + if (ccl::global_data::env().type2_mode == type2_tune_mode::detected || + ccl::global_data::env().type2_mode == type2_tune_mode::on) { + copy_engine_idx = i * 2; + } + + auto list = a.entry->get_copy_list(copy_direction::c2c, copy_engine_idx); + if (a.copy_bytes.at(a.comm->rank()) > 0) { + ZE_APPEND_CALL_TO_ENTRY(args.entry, + ze_cmd_memory_copy, + list, + dst_buf.get_ptr(), + src_buf.get_ptr(), + a.copy_bytes.at(a.comm->rank()), + a.copy_events.at(i), + a.wait_events); + } + else { + // case: + // copy of zero buffer, no ze_cmd_memory_copy occured + // signalling copy_events[i] by hand is required for sync purposes + // ze_cmd_memory_copy cannot be called: + // src_buf might be null, which causes segfault + // copy_event cannot be skipped - it is referenced later + ZE_APPEND_CALL_TO_ENTRY( + args.entry, ze_cmd_barrier, list, a.copy_events.at(i), a.wait_events); + } } } if (a.is_monolithic) { diff --git a/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp b/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp index 62edb827e..cd51ade53 100644 --- a/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp +++ b/src/sched/entry/ze/ze_a2a_allgatherv_entry.hpp @@ -46,7 +46,9 @@ class ze_a2a_allgatherv_entry : public ze_base_entry { ccl_comm* pipeline_comm = nullptr, // whether ipc handle exchange done only for start of the buffer // or separate handles for the buffer partition of each rank - bool is_separate_block_handles = true); + bool is_separate_block_handles = false, + bool is_scaleout = false, + size_t scaleout_offset = 0); void init_ze_hook() override; void update() override; @@ -68,6 +70,8 @@ class ze_a2a_allgatherv_entry : public ze_base_entry { const bool is_monolithic_pipeline; ccl_comm* pipeline_comm; const bool is_separate_block_handles; + const bool is_scaleout; + const size_t scaleout_offset; std::vector copy_events; std::vector kernels; @@ -95,10 +99,13 @@ class ze_a2a_allgatherv_op { size_t peer_buf_offset, std::vector& copy_events, std::vector& wait_events, + ze_event_handle_t out_event, bool is_monolithic, bool is_monolithic_pipeline, bool is_inplace = false, - bool is_separate_block_handles = true); + bool 
is_separate_block_handles = true, + bool is_scaleout = false, + size_t scaleout_offset = 0); // methods static void select(ze_a2a_allgatherv_op& args, std::vector& kernels); // common @@ -123,11 +130,14 @@ class ze_a2a_allgatherv_op { // events std::vector& copy_events; std::vector& wait_events; + ze_event_handle_t out_event; // flags bool is_monolithic; bool is_monolithic_pipeline; bool is_inplace; bool is_separate_block_handles; + bool is_scaleout; + size_t scaleout_offset; private: static void read_write(ze_a2a_allgatherv_op& args, std::vector& kernels); diff --git a/src/sched/entry/ze/ze_a2a_pipeline_reduce_scatter_entry.cpp b/src/sched/entry/ze/ze_a2a_pipeline_reduce_scatter_entry.cpp index 2b437a218..6ce19c2fe 100644 --- a/src/sched/entry/ze/ze_a2a_pipeline_reduce_scatter_entry.cpp +++ b/src/sched/entry/ze/ze_a2a_pipeline_reduce_scatter_entry.cpp @@ -17,7 +17,7 @@ #include "comp/comp.hpp" #include "sched/entry/ze/ze_a2a_pipeline_reduce_scatter_entry.hpp" #include "sched/entry/ze/ze_primitives.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/queue/queue.hpp" #include "coll/coll_util.hpp" #include "sched/entry/factory/entry_factory.hpp" @@ -43,7 +43,7 @@ ze_a2a_pipeline_read_write_entry::ze_a2a_pipeline_read_write_entry( const attr& attrs) : ze_base_entry(sched, wait_events, comm, 1 /* request additional events */), send_buf(send_buf), - tmp_bufs(tmp_bufs), + tmp_bufs(std::move(tmp_bufs)), tmp_buf_idx_start(tmp_buf_idx_start), count(count), dtype(dtype), @@ -164,7 +164,7 @@ ze_a2a_pipeline_reduce_entry::ze_a2a_pipeline_reduce_entry( const std::vector& wait_events) : ze_base_entry(sched, wait_events, comm, 1 /* request additional events */), recv_buf(recv_buf), - tmp_bufs(tmp_bufs), + tmp_bufs(std::move(tmp_bufs)), count(count), dtype(dtype), op(op) {} diff --git a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp index 152395337..c8ca8cdd9 100644 --- a/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp +++ b/src/sched/entry/ze/ze_a2a_reduce_scatter_entry.cpp @@ -15,7 +15,7 @@ */ #include "comp/comp.hpp" #include "sched/entry/ze/ze_a2a_reduce_scatter_entry.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include "sched/entry/factory/entry_factory.hpp" @@ -126,6 +126,7 @@ void ze_a2a_reduce_scatter_entry::kernel_init(size_t rank_buf_offset, std::string kernel_name = "reduce_single_local_inplace_kernel_" + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op); + LOG_DEBUG("get kernel_name: ", kernel_name); // reduce peer values in tmp_buf and own values in send_buf into tmp_buf kernels.reserve(1); void* input_buf = static_cast(send_buf) + rank_buf_offset * dtype.size(); @@ -136,7 +137,7 @@ void ze_a2a_reduce_scatter_entry::kernel_init(size_t rank_buf_offset, else { std::string kernel_name = "reduce_local_inplace_kernel_" + to_string(dtype.idx()) + "_" + ccl_reduction_to_str(op); - + LOG_DEBUG("get kernel_name: ", kernel_name); // reduce peer values in tmp_buf only kernels.reserve(peer_count); for (int i = 1; i < peer_count; ++i) { @@ -200,7 +201,7 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry, CCL_ASSERT(kernels.size(), "expecting to launch monolithic kernel(s), but no kernels were passed"); if (is_monolithic && peer_count <= (int)ccl::ze::max_peer_count) { - // reduce stage + LOG_DEBUG("reduce stage with kernels"); for (size_t i = 0; i < kernels.size(); ++i) { 
ZE_APPEND_CALL_TO_ENTRY(entry, ze_cmd_launch_kernel, @@ -223,6 +224,7 @@ void ze_a2a_reduce_scatter_entry::fill_list(const ze_base_entry* entry, } } else { + LOG_DEBUG("copy peer segments to temp buffer"); size_t copy_bytes = block_count * dtype.size(); /* copy peer segments to temp buffer */ for (int i = 0; i < peer_count; i++) { @@ -322,8 +324,8 @@ void ze_a2a_reduce_scatter_entry::init_ze_hook() { event = ze_base_entry::create_event(); } - size_t rank_buf_offset = - std::accumulate(recv_counts.begin(), recv_counts.begin() + comm_rank, 0); + size_t rank_buf_offset = std::accumulate( + recv_counts.begin(), recv_counts.begin() + comm_rank, ccl::utils::initial_count_value); barrier_event = ze_base_entry::create_event(); @@ -379,7 +381,9 @@ void ze_a2a_reduce_scatter_entry::update() { std::string ze_a2a_reduce_scatter_entry::name_ext() const { std::stringstream out; out << name() << ":" - << std::accumulate(recv_counts.begin(), recv_counts.end(), 0) * dtype.size(); + << std::accumulate( + recv_counts.begin(), recv_counts.end(), ccl::utils::initial_count_value) * + dtype.size(); return out.str(); } @@ -390,7 +394,7 @@ ze_a2a_reduce_scatter_write_copy_entry::ze_a2a_reduce_scatter_write_copy_entry( const std::vector& wait_events) : ze_base_entry(sched, wait_events, rs_args.comm, rs_args.comm->size() * event_group_count), rs_args(rs_args), - rs_bufs(rs_bufs), + rs_bufs(std::move(rs_bufs)), peer_count(comm->size() - 1) {} void ze_a2a_reduce_scatter_write_copy_entry::fill_list_copy( @@ -408,11 +412,14 @@ void ze_a2a_reduce_scatter_write_copy_entry::fill_list_copy( ze_context_handle_t context, size_t worker_idx, const std::vector& wait_events) { + LOG_DEBUG("use fill_list_copy"); + for (int i = 0; i < peer_count; i++) { const int peer_rank = (comm_rank + i + 1) % comm_size; size_t copy_bytes_peer = rs_args.recv_counts[peer_rank] * rs_args.dtype.size(); - size_t peer_rank_buf_offset = std::accumulate( - rs_args.recv_counts.begin(), rs_args.recv_counts.begin() + peer_rank, 0); + size_t peer_rank_buf_offset = std::accumulate(rs_args.recv_counts.begin(), + rs_args.recv_counts.begin() + peer_rank, + ccl::utils::initial_count_value); void* src_write = static_cast(rs_bufs.send_buf.get_ptr()) + (peer_rank_buf_offset + rs_bufs.send_buf_offset) * rs_args.dtype.size(); // write to tmp_buffer without creating any gap in the buffer: @@ -424,7 +431,13 @@ void ze_a2a_reduce_scatter_write_copy_entry::fill_list_copy( void* dst_write = static_cast(peer_recv_bufs[i].get_ptr()) + peer_block_offset * copy_bytes_peer; // request copy engine at even index, it can be helpful in certain situations - auto list = entry->get_copy_list(copy_direction::c2c, (i + 1) * 2); + + auto copy_engine_idx = (i + 1) * 2; + if (ccl::global_data::env().type2_mode == type2_tune_mode::detected || + ccl::global_data::env().type2_mode == type2_tune_mode::on) { + copy_engine_idx = i * 2; + } + auto list = entry->get_copy_list(copy_direction::c2c, copy_engine_idx); ZE_APPEND_CALL_TO_ENTRY(entry, ze_cmd_memory_copy, @@ -488,7 +501,9 @@ void ze_a2a_reduce_scatter_write_copy_entry::update() { std::string ze_a2a_reduce_scatter_write_copy_entry::name_ext() const { std::stringstream out; out << name() << ":" - << std::accumulate(rs_args.recv_counts.begin(), rs_args.recv_counts.end(), 0) * + << std::accumulate(rs_args.recv_counts.begin(), + rs_args.recv_counts.end(), + ccl::utils::initial_count_value) * rs_args.dtype.size(); return out.str(); } @@ -500,7 +515,7 @@ ze_a2a_reduce_scatter_write_kernel_entry::ze_a2a_reduce_scatter_write_kernel_ent const 
std::vector& wait_events) : ze_base_entry(sched, wait_events, rs_args.comm, rs_args.comm->size() * event_group_count), rs_args(rs_args), - rs_bufs(rs_bufs), + rs_bufs(std::move(rs_bufs)), peer_count(rs_args.comm->size() - 1) {} void ze_a2a_reduce_scatter_write_kernel_entry::kernel_init(size_t rank_buf_offset, @@ -523,6 +538,7 @@ void ze_a2a_reduce_scatter_write_kernel_entry::kernel_init(size_t rank_buf_offse to_string(rs_args.dtype.idx()) + "_" + ccl_reduction_to_str(rs_args.op); + LOG_DEBUG("get kernel name: ", kernel_name); // reduce peer values in tmp_buf and own values in send_buf into tmp_buf kernels.reserve(1); void* input_buf = static_cast(rs_bufs.send_buf.get_ptr()) + @@ -535,6 +551,8 @@ void ze_a2a_reduce_scatter_write_kernel_entry::kernel_init(size_t rank_buf_offse else { std::string kernel_name = "reduce_local_inplace_kernel_" + to_string(rs_args.dtype.idx()) + "_" + ccl_reduction_to_str(rs_args.op); + LOG_DEBUG("get kernel name: ", kernel_name); + kernels.reserve(peer_count); for (int i = 1; i < peer_count; ++i) { void* input_buf = static_cast(rs_bufs.send_buf.get_ptr()) + @@ -595,6 +613,14 @@ void ze_a2a_reduce_scatter_write_kernel_entry::fill_list_kernel( void ze_a2a_reduce_scatter_write_kernel_entry::init_ze_hook() { size_t buf_bytes = rs_args.dtype.size() * rs_args.recv_counts[comm_rank]; + if (!buf_bytes) { + ZE_APPEND_CALL(ze_cmd_barrier, + ze_base_entry::get_copy_list(), + ze_base_entry::entry_event, + wait_events); + return; + } + if (ccl::global_data::env().enable_kernel_single_reduce_peers) { kernel_events.resize(1); } @@ -605,8 +631,9 @@ void ze_a2a_reduce_scatter_write_kernel_entry::init_ze_hook() { event = ze_base_entry::create_event(); } - size_t rank_buf_offset = - std::accumulate(rs_args.recv_counts.begin(), rs_args.recv_counts.begin() + comm_rank, 0); + size_t rank_buf_offset = std::accumulate(rs_args.recv_counts.begin(), + rs_args.recv_counts.begin() + comm_rank, + ccl::utils::initial_count_value); barrier_event = ze_base_entry::create_event(); @@ -633,9 +660,6 @@ void ze_a2a_reduce_scatter_write_kernel_entry::init_ze_hook() { buf_bytes, ze_base_entry::entry_event, kernel_events); - - ZE_APPEND_CALL( - ze_cmd_barrier, ze_base_entry::get_copy_list(), ze_base_entry::entry_event, kernel_events); } void ze_a2a_reduce_scatter_write_kernel_entry::update() { @@ -645,7 +669,9 @@ void ze_a2a_reduce_scatter_write_kernel_entry::update() { std::string ze_a2a_reduce_scatter_write_kernel_entry::name_ext() const { std::stringstream out; out << name() << ":" - << std::accumulate(rs_args.recv_counts.begin(), rs_args.recv_counts.end(), 0) * + << std::accumulate(rs_args.recv_counts.begin(), + rs_args.recv_counts.end(), + ccl::utils::initial_count_value) * rs_args.dtype.size(); return out.str(); } diff --git a/src/sched/entry/ze/ze_alltoallv_entry.cpp b/src/sched/entry/ze/ze_alltoallv_entry.cpp index 2ba94055f..bd3aba9b9 100644 --- a/src/sched/entry/ze/ze_alltoallv_entry.cpp +++ b/src/sched/entry/ze/ze_alltoallv_entry.cpp @@ -17,7 +17,7 @@ #include "comp/comp.hpp" #include "sched/entry/ze/ze_alltoallv_entry.hpp" #include "sched/entry/ze/ze_primitives.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/queue/queue.hpp" #include @@ -35,9 +35,9 @@ ze_alltoallv_entry::ze_alltoallv_entry(ccl_sched* sched, ccl_comm* comm, const std::vector& wait_events) : ze_base_entry(sched, wait_events, comm, 1 /* request additional events */), - send_bufs(send_bufs), - recv_bufs(recv_bufs), - counts(counts), + send_bufs(std::move(send_bufs)), + 
recv_bufs(std::move(recv_bufs)), + counts(std::move(counts)), buf_idx_start(buf_idx_start), dtype(dtype) {} diff --git a/src/sched/entry/ze/ze_base_entry.cpp b/src/sched/entry/ze/ze_base_entry.cpp index 8841c0554..c9d43253f 100644 --- a/src/sched/entry/ze/ze_base_entry.cpp +++ b/src/sched/entry/ze/ze_base_entry.cpp @@ -19,7 +19,7 @@ #include "common/global/global.hpp" #include "common/api_wrapper/ze_api_wrapper.hpp" #include "sched/entry/ze/ze_base_entry.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/entry/ze/ze_call.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include "sched/sched.hpp" @@ -119,9 +119,14 @@ void ze_base_entry::init_entries() { auto &entries = sched->ze_entries; if (entries.front() == this) { LOG_DEBUG("init ", entries.size(), " entries"); + for (auto &entry : entries) { entry->init(); } + auto sync_obj = sched->get_init_ze_hook_sync_obj(); + if (sync_obj) { + sync_obj->visit(); + } } } @@ -136,11 +141,6 @@ void ze_base_entry::finalize_entries() { } void ze_base_entry::start() { -#ifdef CCL_ENABLE_ITT - ccl::profile::itt::task_end(ccl::profile::itt::task_type::preparation); - ccl::profile::itt::task_start(ccl::profile::itt::task_type::device_work); -#endif // CCL_ENABLE_ITT - if (use_single_list) { init_entries(); } @@ -153,7 +153,7 @@ void ze_base_entry::start() { // in single_list mode globally we have only one list per queue, so execute it elsewhere // in non single_list mode each entry execute only own lists if ((use_single_list && sched->ze_entries.front() == this && - ze_command::bypass_command_flag()) || + sched->get_ze_commands_bypass_flag()) || !use_single_list) { sched_entry::ze_commands_submit(); sched->get_memory().list_manager->execute(this); @@ -198,11 +198,6 @@ void ze_base_entry::update() { LOG_DEBUG(name(), " ", this, " entry complete"); status = ccl_sched_entry_status_complete; -#ifdef CCL_ENABLE_ITT - ccl::profile::itt::task_end(ccl::profile::itt::task_type::device_work); - ccl::profile::itt::task_start(ccl::profile::itt::task_type::completion); -#endif // CCL_ENABLE_ITT - if (use_single_list) { reset_events(); } diff --git a/src/sched/entry/ze/ze_command.cpp b/src/sched/entry/ze/ze_command.cpp index f7cc24a6f..69769ba22 100644 --- a/src/sched/entry/ze/ze_command.cpp +++ b/src/sched/entry/ze/ze_command.cpp @@ -18,14 +18,6 @@ #include "sched/entry/ze/ze_command.hpp" #include "common/global/global.hpp" -namespace ze_command { - -bool bypass_command_flag() { - return ccl::global_data::env().enable_ze_cmd_bypass; -} - -} // namespace ze_command - void ze_cmd_memory_copy::ze_call() { ZE_CALL(zeCommandListAppendMemoryCopy, (cmdlist, //ze_command_list_handle_t command_list_handle, @@ -51,6 +43,20 @@ void ze_cmd_barrier::ze_call() { )); } +void ze_cmd_mem_range_barrier::ze_call() { + ZE_CALL( + zeCommandListAppendMemoryRangesBarrier, + (cmdlist, //ze_command_list_handle_t command_list_handle, + range_sizes.size(), //uint32_t numRanges, [in] number of memory ranges + &(range_sizes + [0]), // const size_t* pRangerange_sizes, [in][range(0, numRanges)] array of range_sizes of memory range + &(ranges[0]), // const void** pRanges, [in][range(0, numRanges)] array of memory ranges + signal_event, //ze_event_handle_t signal_event, + wait_events.size(), //uint32_t numwait_events, + wait_events.data() //ze_event_handle_t *phwait_events) + )); +} + void ze_cmd_wait_on_events::ze_call() { ZE_CALL(zeCommandListAppendWaitOnEvents, (cmdlist, //ze_command_list_handle_t command_list_handle, diff --git 
a/src/sched/entry/ze/ze_command.hpp b/src/sched/entry/ze/ze_command.hpp index ecd8b29a7..2edd4fabd 100644 --- a/src/sched/entry/ze/ze_command.hpp +++ b/src/sched/entry/ze/ze_command.hpp @@ -28,19 +28,19 @@ class ze_command_t { using ze_commands_t = std::vector>; #define ZE_APPEND_CALL_TO_ENTRY(base_entry, command, params...) \ - ze_command::create(base_entry->get_ze_commands(), params); + ze_command::create(base_entry->get_ze_commands(), \ + base_entry->get_sched()->get_ze_commands_bypass_flag(), \ + params); #define ZE_APPEND_CALL(command, params...) ZE_APPEND_CALL_TO_ENTRY(this, command, params); namespace ze_command { -bool bypass_command_flag(); - template -CommandType* create(const ze_commands_t& ze_commands, Arguments&&... args) { - LOG_DEBUG("creating: ", CommandType::class_name(), " command"); +CommandType* create(const ze_commands_t& ze_commands, bool bypass_flag, Arguments&&... args) { + LOG_DEBUG("creating: ", CommandType::class_name(), " command. bypass: ", bypass_flag); - if (bypass_command_flag()) { + if (bypass_flag) { auto cmd = std::make_unique(std::forward(args)...); cmd->ze_call(); return nullptr; @@ -139,6 +139,40 @@ class ze_cmd_barrier : public ze_command_t { void ze_call() override; }; +class ze_cmd_mem_range_barrier : public ze_command_t { + ze_command_list_handle_t cmdlist{}; + + std::vector range_sizes; + std::vector ranges; + + ze_event_handle_t signal_event{}; + std::vector wait_events{}; + +public: + static constexpr const char* class_name() noexcept { + return "ZECMD_MEMBARRIER"; + } + const char* name() const override { + return class_name(); + } + + ze_cmd_mem_range_barrier() = delete; + ze_cmd_mem_range_barrier(ze_command_list_handle_t cmdlist, + const std::vector& range_sizes, + const std::vector& ranges, + ze_event_handle_t signal_event, + const std::vector& wait_events) + : cmdlist(cmdlist), + range_sizes(range_sizes), + ranges(ranges), + signal_event(signal_event), + wait_events(wait_events) { + CCL_THROW_IF_NOT(range_sizes.size() == ranges.size() && !range_sizes.empty()); + } + + void ze_call() override; +}; + class ze_cmd_wait_on_events : public ze_command_t { ze_command_list_handle_t cmdlist{}; std::vector wait_events{}; diff --git a/src/sched/entry/ze/ze_copy_entry.cpp b/src/sched/entry/ze/ze_copy_entry.cpp index 83c90a5bc..4d18c8d26 100644 --- a/src/sched/entry/ze/ze_copy_entry.cpp +++ b/src/sched/entry/ze/ze_copy_entry.cpp @@ -31,7 +31,6 @@ ze_copy_entry::ze_copy_entry(ccl_sched* sched, nullptr /*comm*/, 1 /*add_event_count*/, true /*is_nonblocking*/), - sched(sched), in_buf(in_buf), out_buf(out_buf), dtype(dtype), @@ -41,15 +40,20 @@ ze_copy_entry::ze_copy_entry(ccl_sched* sched, } void ze_copy_entry::init_ze_hook() { + int peer_rank = attr.peer_rank; + if (attr.pt2pt_op) { + peer_rank = ccl::ze::ipc_handle_manager::pt2pt_handles_size - 1; + } + if (attr.peer_rank != ccl_comm::invalid_rank) { if (!out_buf) { sched->get_memory().handle_manager.get( - attr.peer_rank, attr.peer_buf_idx, out_buf, attr.map_comm); + peer_rank, attr.peer_buf_idx, out_buf, attr.map_comm, attr.pt2pt_op); } if (!in_buf) { sched->get_memory().handle_manager.get( - attr.peer_rank, attr.peer_buf_idx, in_buf, attr.map_comm); + peer_rank, attr.peer_buf_idx, in_buf, attr.map_comm, attr.pt2pt_op); } } diff --git a/src/sched/entry/ze/ze_copy_entry.hpp b/src/sched/entry/ze/ze_copy_entry.hpp index f5a7c317b..706bf46e3 100644 --- a/src/sched/entry/ze/ze_copy_entry.hpp +++ b/src/sched/entry/ze/ze_copy_entry.hpp @@ -42,7 +42,6 @@ class ze_copy_entry : public ze_base_entry { void 
init_ze_hook() override; private: - ccl_sched* const sched; ccl_buffer in_buf{}; ccl_buffer out_buf{}; const ccl_datatype dtype; diff --git a/src/sched/entry/ze/ze_execute_cmdlists_entry.hpp b/src/sched/entry/ze/ze_execute_cmdlists_entry.hpp index 969870f62..32ae4d768 100644 --- a/src/sched/entry/ze/ze_execute_cmdlists_entry.hpp +++ b/src/sched/entry/ze/ze_execute_cmdlists_entry.hpp @@ -18,25 +18,40 @@ #include "sched/entry/ze/ze_primitives.hpp" #include "sched/entry/ze/ze_base_entry.hpp" -class execute_cmdlists_entry : public ze_base_entry { +// These entries, ze_execute_cmdlists_on_*_entry, are used to submit commands +// to their respective command lists/queues, execute them, and mark the +// schedule as 'submitted to GPU'. +// +// The difference between both, lies on the moment when these steps are taken: +// - on_init: similar to when init_ze_hook() is executed; i.e., upon start() +// of the first ze_base_entry of the schedule. +// - on_start: upon start() of this entry. +// +// Typically, algorithms will want to use on_init. Only in specific cases, +// such as when ze_commands are cached, reordered, dynamically submitted, +// etc., on_start should be used. + +class ze_execute_cmdlists_on_init_entry : public ze_base_entry { public: static constexpr const char* class_name() noexcept { - return "EXEC_CMDLIST"; + return "ZEEXEC_CMDLIST_INIT"; } const char* name() const override { return class_name(); } - execute_cmdlists_entry(ccl_sched* sched) : ze_base_entry(sched, {}) {} + ze_execute_cmdlists_on_init_entry(ccl_sched* sched) : ze_base_entry(sched, {}) {} void init() override { LOG_DEBUG("execute cmdlists entry"); - if (sched->use_single_list && !ze_command::bypass_command_flag()) { + if (sched->use_single_list && !sched->get_ze_commands_bypass_flag()) { // submit commands to command lists - sched->ze_commands_submit(); + int cmd_counter = sched->ze_commands_submit(); - // once command lists have commands, execute their associated cmdqueues - sched->get_memory().list_manager->execute(this); + if (cmd_counter) { + // once command lists have commands, execute their associated cmdqueues + sched->get_memory().list_manager->execute(this); + } sched->set_submitted_to_gpu(true); } @@ -48,3 +63,72 @@ class execute_cmdlists_entry : public ze_base_entry { ze_base_entry::update(); } }; + +typedef uint32_t (*ze_commands_submit_function_t)(ccl_sched*); + +class ze_execute_cmdlists_on_start_entry : public sched_entry { +public: + static constexpr const char* class_name() noexcept { + return "ZEEXEC_CMDLIST_START"; + } + const char* name() const override { + return class_name(); + } + + ze_execute_cmdlists_on_start_entry(ccl_sched* sched, + std::shared_ptr sync_obj = nullptr, + ze_commands_submit_function_t submit_fn = nullptr) + : sched_entry(sched, + false /*is_barrier*/, + false /*is_urgent*/, + false /*is_nonblocking*/), + sync_obj(std::move(sync_obj)), + submit_fn(submit_fn) {} + + void start() override { + status = ccl_sched_entry_status_started; + } + + void update() override { + if (sync_obj && sync_obj->value() > 0) { + return; + } + + if (sched->use_single_list && !sched->get_ze_commands_bypass_flag()) { + if (!commands_submitted_flag) { + // submit commands to command lists + LOG_DEBUG("submit commands to device"); + if (submit_fn) { + cmd_counter = submit_fn(sched); + } + else { + cmd_counter = sched->ze_commands_submit(); + } + commands_submitted_flag = true; + } + + if (cmd_counter > 0) { + // once command lists have commands, execute their associated cmdqueues + LOG_DEBUG("execute 
command lists. cmd_counter: ", cmd_counter); + sched->get_memory().list_manager->execute(this); + } + + sched->set_submitted_to_gpu(true); + } + + status = ccl_sched_entry_status_complete; + } + + void reset(size_t idx) override { + sched_entry::reset(idx); + if (sync_obj) { + sync_obj->reset(); + } + } + +private: + std::shared_ptr sync_obj; + ze_commands_submit_function_t submit_fn; + uint32_t cmd_counter{}; + bool commands_submitted_flag{ false }; +}; diff --git a/src/sched/entry/ze/ze_handle_exchange_entry.cpp b/src/sched/entry/ze/ze_handle_exchange_entry.cpp index ec5b05d49..67c13c259 100644 --- a/src/sched/entry/ze/ze_handle_exchange_entry.cpp +++ b/src/sched/entry/ze/ze_handle_exchange_entry.cpp @@ -29,16 +29,19 @@ static void cast_pool_to_mem_handle(ze_ipc_mem_handle_t* mem, memcpy(mem, pool, sizeof(*pool)); } -ze_handle_exchange_entry::ze_handle_exchange_entry(ccl_sched* sched, - ccl_comm* comm, - const std::vector& in_buffers, - int skip_rank) +ze_handle_exchange_entry::ze_handle_exchange_entry( + ccl_sched* sched, + ccl_comm* comm, + const std::vector& in_buffers, + int skip_rank, + ccl::utils::pt2pt_handle_exchange_info pt2pt_info) : sched_entry(sched, false /*is_barrier*/, true /*is_urgent*/), comm(comm), in_buffers(in_buffers), rank(comm->rank()), comm_size(comm->size()), - skip_rank(skip_rank) { + skip_rank(skip_rank), + pt2pt_info(pt2pt_info) { LOG_DEBUG("init"); CCL_THROW_IF_NOT(sched, "no sched"); @@ -79,19 +82,19 @@ void ze_handle_exchange_entry::start() { start_buf_idx = start_peer_idx = 0; skip_first_send = false; status = ccl_sched_entry_status_started; - if (comm_size == 1) { status = ccl_sched_entry_status_complete; } } uint32_t ze_handle_exchange_entry::get_remote_device_id(ccl::ze::device_info& info) { - auto idx = info.physical_idx; + int idx = info.physical_idx; if (!ccl::global_data::env().ze_drm_bdf_support || - (int)idx == ccl::ze::fd_manager::invalid_physical_idx) { - idx = info.parent_idx; + idx == ccl::ze::fd_manager::invalid_physical_idx) { + idx = info.parent_idx; // unsigned -> signed conversion } - return idx; + CCL_THROW_IF_NOT(idx >= 0, "invalid device index conversion"); + return static_cast(idx); } void ze_handle_exchange_entry::create_local_ipc_handles(const std::vector& bufs) { @@ -102,7 +105,12 @@ void ze_handle_exchange_entry::create_local_ipc_handles(const std::vectorcoll_param.ctype == ccl_coll_send || sched->coll_param.ctype == ccl_coll_recv) { + handles_size = ccl::ze::ipc_handle_manager::pt2pt_handles_size; + } + + handles.resize(handles_size); for (auto& buffers : handles) { buffers.resize(in_buffers.size()); } @@ -157,8 +165,8 @@ void ze_handle_exchange_entry::create_local_ipc_handles(const std::vectorcoll_param.ctype, rank); + handles[handle_idx][buf_idx] = { ipc_handle, mem_info.second, mem_type, mem_handle }; LOG_DEBUG("set IPC handle: { rank: ", rank, ", buf_idx: ", @@ -199,9 +207,10 @@ int ze_handle_exchange_entry::ipc_to_mem_handle(const ze_ipc_mem_handle_t& ipc_h void ze_handle_exchange_entry::fill_payload(payload_t& payload, const std::vector& bufs, size_t buf_idx) { - payload.mem_handle = handles[rank][buf_idx].mem_handle; - payload.mem_type = handles[rank][buf_idx].mem_type; - payload.mem_offset = handles[rank][buf_idx].mem_offset; + int handle_idx = get_handle_idx(sched->coll_param.ctype, rank); + payload.mem_handle = handles[handle_idx][buf_idx].mem_handle; + payload.mem_type = handles[handle_idx][buf_idx].mem_type; + payload.mem_offset = handles[handle_idx][buf_idx].mem_offset; payload.remote_pid = getpid(); const void* ptr 
= bufs[buf_idx].first; if (ptr == nullptr) { @@ -252,6 +261,30 @@ void ze_handle_exchange_entry::fill_remote_handle(const payload_t& payload, " }"); } +int ze_handle_exchange_entry::get_remote_physical_device_fd(const ssize_t remote_device_id) { + int ret = ccl::utils::invalid_device_id; + // empty buffers should contain remote_device_id == -1 and empty physical_devices + if (remote_device_id >= 0 && physical_devices.size() > 0) { + // buffers are not empty, continue filling the payload + auto& devices = ccl::global_data::get().ze_data->devices; + CCL_THROW_IF_NOT(static_cast(remote_device_id) < devices.size(), + "remote_device_id [", + remote_device_id, + "] out of range [0; ", + devices.size(), + ")"); + auto remote_device = devices[remote_device_id]; + uint32_t remote_physical_device_id = get_remote_device_id(remote_device); + CCL_THROW_IF_NOT(remote_physical_device_id < physical_devices.size(), + "remote_physical_device_id [", + remote_physical_device_id, + "] is invalid, >= ", + physical_devices.size()); + ret = physical_devices[remote_physical_device_id].fd; + } // else: buffer is empty, physical device is invalid + return ret; +} + void ze_handle_exchange_entry::common_fd_mode_exchange(const std::vector& bufs) { for (size_t buf_idx = 0; buf_idx < bufs.size(); buf_idx++) { std::vector payloads(comm_size); @@ -269,12 +302,8 @@ void ze_handle_exchange_entry::common_fd_mode_exchange(const std::vectordevices[remote_device_id]; - // empty buffers do not have devices specified - payloads[idx].device_fd = (physical_devices.size() > 0) - ? physical_devices[get_remote_device_id(device)].fd - : -1; + payloads[idx].device_fd = + get_remote_physical_device_fd(payloads[idx].remote_device_id); } else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) { opened_pidfds.push_back(ccl::ze::fd_manager::pidfd_open(payloads[idx].remote_pid)); @@ -299,6 +328,79 @@ void ze_handle_exchange_entry::common_fd_mode_exchange(const std::vectorget_memory().handle_manager.set(handles); } +void ze_handle_exchange_entry::pt2pt_fd_mode_exchange(const std::vector& bufs) { + int peer_rank = pt2pt_info.peer_rank; + int pt2pt_sched_id = comm->get_atl_comm()->tag_creator->get_pt2pt_sched_id(); + + LOG_DEBUG("pt2pt_fd_mode_exchange is chosen: bufs size: ", + bufs.size(), + ", rank: ", + rank, + ", peer_rank: ", + peer_rank); + for (size_t buf_idx = 0; buf_idx < bufs.size(); buf_idx++) { + std::vector payloads(1); + payload_t payload{}; + fill_payload(payload, bufs, buf_idx); + + if (pt2pt_info.role == ccl::utils::pt2pt_handle_exchange_role::sender) { + ccl::utils::send( + comm->get_atl_comm(), &payload, sizeof(payload_t), peer_rank, pt2pt_sched_id); + LOG_DEBUG("send: from rank: ", + rank, + " to peer_rank: ", + peer_rank, + ", payload: ", + &payload, + ", size msg: ", + sizeof(payload_t)); + } + else if (pt2pt_info.role == ccl::utils::pt2pt_handle_exchange_role::receiver) { + ccl::utils::recv(comm->get_atl_comm(), + payloads.data(), + sizeof(payload_t), + peer_rank, + pt2pt_sched_id); + LOG_DEBUG("recv: from rank: ", + peer_rank, + " in rank: ", + rank, + ", payloads: ", + &payload, + ", size msg: ", + sizeof(payload_t)); + + for (size_t idx = 0; idx < payloads.size(); idx++) { + if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd) { + payloads[idx].device_fd = + get_remote_physical_device_fd(payloads[idx].remote_device_id); + } + else if (ccl::global_data::env().ze_ipc_exchange == + ccl::ze::ipc_exchange_mode::pidfd) { + opened_pidfds.push_back( + 
ccl::ze::fd_manager::pidfd_open(payloads[idx].remote_pid)); + payloads[idx].pidfd_fd = opened_pidfds.back(); + } + else { + CCL_THROW("unexpected ipc_exchange_mode for pt2pt"); + } + fill_remote_handle( + payloads[idx], + {}, // ipc_handle is empty, it's initialized immediately before calling zeMemOpenIpcHandle + idx, + buf_idx); + } + } + else { + CCL_THROW("unexpected pt2pt_handle_exchange_mode," + " could not identify the role of rank"); + } + } + + LOG_DEBUG("pt2pt_fd_mode_exchange is finished"); + sched->get_memory().handle_manager.set(handles, true); +} + int ze_handle_exchange_entry::sockets_mode_exchange(const std::vector<mem_desc_t>& bufs) { if (!is_created) { // server @@ -425,18 +527,29 @@ int ze_handle_exchange_entry::sockets_mode_exchange(const std::vector<mem_desc_t>& bufs) { void ze_handle_exchange_entry::update() { if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::sockets) { + if (sched->coll_param.ctype == ccl_coll_send || sched->coll_param.ctype == ccl_coll_recv) { + CCL_THROW( + "sockets ipc_exchange_mode is not supported for pt2pt operations, use drmfd or pidfd"); + } + if (sockets_mode_exchange(in_buffers)) { return; } } else if (ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::drmfd || ccl::global_data::env().ze_ipc_exchange == ccl::ze::ipc_exchange_mode::pidfd) { - common_fd_mode_exchange(in_buffers); + if (sched->coll_param.ctype == ccl_coll_send || sched->coll_param.ctype == ccl_coll_recv) { + pt2pt_fd_mode_exchange(in_buffers); + } + else { + common_fd_mode_exchange(in_buffers); + } } else { CCL_THROW("unexpected ipc_exchange_mode"); } status = ccl_sched_entry_status_complete; + LOG_DEBUG("completed: ", name()); } @@ -577,6 +690,14 @@ void ze_handle_exchange_entry::close_sockets() { } } +int ze_handle_exchange_entry::get_handle_idx(ccl_coll_type ctype, int rank_arg) { + int idx = rank_arg; + if (ctype == ccl_coll_send || ctype == ccl_coll_recv) { + idx = 0; + } + return idx; +} + void ze_handle_exchange_entry::dump_detail(std::stringstream& str) const { ccl_logger::format(str, "comm ", diff --git a/src/sched/entry/ze/ze_handle_exchange_entry.hpp b/src/sched/entry/ze/ze_handle_exchange_entry.hpp index 018eb58fa..2512d1c90 100644 --- a/src/sched/entry/ze/ze_handle_exchange_entry.hpp +++ b/src/sched/entry/ze/ze_handle_exchange_entry.hpp @@ -41,8 +41,12 @@ class ze_handle_exchange_entry : public sched_entry { explicit ze_handle_exchange_entry(ccl_sched* sched, ccl_comm* comm, const std::vector<mem_desc_t>& in_buffers, - int skip_rank = -1); + int skip_rank, + // only for pt2pt usage: + ccl::utils::pt2pt_handle_exchange_info pt2pt_info); ~ze_handle_exchange_entry(); + ze_handle_exchange_entry& operator=(const ze_handle_exchange_entry&) = delete; + ze_handle_exchange_entry(const ze_handle_exchange_entry&) = delete; void start() override; void update() override; @@ -63,6 +67,8 @@ class ze_handle_exchange_entry : public sched_entry { const int rank; const int comm_size; int skip_rank; + ccl::utils::pt2pt_handle_exchange_info pt2pt_info; + pid_t current_pid = ccl::utils::invalid_pid; int start_buf_idx{}; @@ -108,6 +114,7 @@ class ze_handle_exchange_entry : public sched_entry { void create_local_ipc_handles(const std::vector<mem_desc_t>& bufs); int sockets_mode_exchange(const std::vector<mem_desc_t>& bufs); void common_fd_mode_exchange(const std::vector<mem_desc_t>& bufs); + void pt2pt_fd_mode_exchange(const std::vector<mem_desc_t>& bufs); bool is_created{}; bool is_connected{}; @@ -141,4 +148,6 @@ class ze_handle_exchange_entry : public sched_entry { void close_sockets(); uint32_t get_remote_device_id(ccl::ze::device_info& info); + int get_remote_physical_device_fd(const ssize_t remote_device_id); + int get_handle_idx(ccl_coll_type ctype, int
rank_arg); }; diff --git a/src/sched/entry/ze/ze_kernel.cpp b/src/sched/entry/ze/ze_kernel.cpp index 41ff249ca..7fa6e9d32 100644 --- a/src/sched/entry/ze/ze_kernel.cpp +++ b/src/sched/entry/ze/ze_kernel.cpp @@ -67,9 +67,14 @@ ze_kernel::ze_kernel(ze_kernel &&other) noexcept other.kernel = nullptr; }; -ze_kernel::~ze_kernel() { - if (kernel) { - global_data::get().ze_data->cache->push(worker_idx, module, kernel_name, kernel); +ze_kernel::~ze_kernel() noexcept { + try { + if (kernel) { + global_data::get().ze_data->cache->push(worker_idx, module, kernel_name, kernel); + } + } + catch (...) { + LOG_ERROR("error pushing to the kernel cache"); } } diff --git a/src/sched/entry/ze/ze_kernel.hpp b/src/sched/entry/ze/ze_kernel.hpp index 43074ce48..4c0a91dbd 100644 --- a/src/sched/entry/ze/ze_kernel.hpp +++ b/src/sched/entry/ze/ze_kernel.hpp @@ -39,7 +39,7 @@ class ze_kernel { ze_kernel(const ze_kernel &) = delete; ze_kernel(ze_kernel &&other) noexcept; - ~ze_kernel(); + ~ze_kernel() noexcept; private: void actually_call_ze(ze_command_list_handle_t list, diff --git a/src/sched/entry/ze/ze_membarrier_entry.cpp b/src/sched/entry/ze/ze_membarrier_entry.cpp new file mode 100644 index 000000000..e14550af2 --- /dev/null +++ b/src/sched/entry/ze/ze_membarrier_entry.cpp @@ -0,0 +1,57 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#include "common/api_wrapper/ze_api_wrapper.hpp" +#include "sched/entry/ze/ze_membarrier_entry.hpp" +#include "sched/entry/ze/ze_base_entry.hpp" +#include "sched/entry/ze/ze_primitives.hpp" + +ze_membarrier_entry::ze_membarrier_entry(ccl_sched* sched, + size_t range_size, + const void* range, + const std::vector& wait_events) + : ze_membarrier_entry(sched, + std::vector({ range_size }), + std::vector({ range }), + wait_events) {} + +ze_membarrier_entry::ze_membarrier_entry(ccl_sched* sched, + const std::vector& range_sizes, + const std::vector& ranges, + const std::vector& wait_events) + : ze_base_entry(sched, + wait_events, + nullptr /*comm*/, + 1 /*add_event_count*/, + true /*is_nonblocking*/), + range_sizes(range_sizes), + ranges(ranges) { + CCL_THROW_IF_NOT(sched, "no sched"); +} + +void ze_membarrier_entry::init_ze_hook() { + ze_command_list_handle_t list = ze_base_entry::get_copy_list(); + + ZE_APPEND_CALL(ze_cmd_mem_range_barrier, + list, + range_sizes, + ranges, + ze_base_entry::entry_event, + wait_events); +} + +std::string ze_membarrier_entry::name_ext() const { + return name(); +} diff --git a/src/sched/entry/ze/ze_membarrier_entry.hpp b/src/sched/entry/ze/ze_membarrier_entry.hpp new file mode 100644 index 000000000..261485a08 --- /dev/null +++ b/src/sched/entry/ze/ze_membarrier_entry.hpp @@ -0,0 +1,47 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +#pragma once + +#include "sched/entry/factory/entry_factory.hpp" + +class ze_membarrier_entry : public ze_base_entry { +public: + static constexpr const char* class_name() noexcept { + return "ZE_MEMBARRIER"; + } + + const char* name() const override { + return class_name(); + } + + virtual std::string name_ext() const override; + + explicit ze_membarrier_entry(ccl_sched* sched, + const std::vector<size_t>& range_sizes, + const std::vector<const void*>& ranges, + const std::vector<ze_event_handle_t>& wait_events = {}); + + explicit ze_membarrier_entry(ccl_sched* sched, + size_t range_size, + const void* range, + const std::vector<ze_event_handle_t>& wait_events = {}); + + void init_ze_hook() override; + +private: + std::vector<size_t> range_sizes; + std::vector<const void*> ranges; +}; diff --git a/src/sched/entry/ze/ze_onesided_reduce_entry.cpp b/src/sched/entry/ze/ze_onesided_reduce_entry.cpp index 12e006ebd..a9bc504a5 100644 --- a/src/sched/entry/ze/ze_onesided_reduce_entry.cpp +++ b/src/sched/entry/ze/ze_onesided_reduce_entry.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "common/stream/stream.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/entry/ze/ze_onesided_reduce_entry.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include "sched/queue/queue.hpp" @@ -145,11 +145,11 @@ void ze_onesided_reduce_entry::init_ze_hook() { } LOG_DEBUG("ze_onesided_reduce_entry with aligned monolithic kernels"); - // use recv_buf_ptr instead of right_recv_buf_ptr since we cannot make sure - // if right_recv_buf_ptr got using ipc has the same alignment as remote recv_buf_ptr. - // we assume local recv_buf_ptr and remote recv_buf_ptr has the same alignment + // use send_buf_ptr instead of right_send_buf_ptr since we cannot be sure + // that right_send_buf_ptr obtained via IPC has the same alignment as the remote send_buf_ptr. + // we assume local send_buf_ptr and remote send_buf_ptr have the same alignment unsigned long pre_align_offset_byte = ccl::utils::get_aligned_offset_byte( - recv_buf_ptr, buf_size_bytes, ccl::global_data::env().kernel_mem_align); + send_buf_ptr, buf_size_bytes, ccl::global_data::env().kernel_mem_align); // first kernel starts from location 0 to pre_align_offset_byte // and the second kernel starts from location pre_align_offset_byte to the rest diff --git a/src/sched/entry/ze/ze_onesided_reduce_entry.hpp b/src/sched/entry/ze/ze_onesided_reduce_entry.hpp index 6d11cb33a..9cd3debd5 100644 --- a/src/sched/entry/ze/ze_onesided_reduce_entry.hpp +++ b/src/sched/entry/ze/ze_onesided_reduce_entry.hpp @@ -56,11 +56,11 @@ class ze_onesided_reduce_entry : public ze_base_entry { void dump_detail(std::stringstream& str) const override; private: - ccl_buffer send_buf; - ccl_buffer recv_buf; - void* send_buf_ptr; - void* recv_buf_ptr; - void* right_send_buf_ptr; + ccl_buffer send_buf{}; + ccl_buffer recv_buf{}; + void* send_buf_ptr{}; + void* recv_buf_ptr{}; + void* right_send_buf_ptr{}; const unsigned long cnt; const ccl_datatype dtype; const ccl::reduction op; @@ -68,12 +68,12 @@ class ze_onesided_reduce_entry : public ze_base_entry { const size_t buf_size_bytes; const size_t peer_buf_offset_bytes; - ze_event_handle_t empty_kernel_event; - ze_event_handle_t copy_from_peer_event; + ze_event_handle_t empty_kernel_event{}; + ze_event_handle_t copy_from_peer_event{}; - std::string main_kernel_name; + std::string main_kernel_name{}; - std::string empty_kernel_name; + std::string empty_kernel_name{}; bool skip_entry{}; }; diff --git a/src/sched/entry/ze/ze_primitives.cpp b/src/sched/entry/ze/ze_primitives.cpp index 0d67130e3..2a4275b5d 100644 --- a/src/sched/entry/ze/ze_primitives.cpp +++ b/src/sched/entry/ze/ze_primitives.cpp @@ -336,6 +336,7 @@ device_family get_device_family(ze_device_handle_t device) { switch (id) { case static_cast<uint32_t>(device_id::id1): return device_family::family1; case static_cast<uint32_t>(device_id::id2): return device_family::family2; + case static_cast<uint32_t>(device_id::id3): return device_family::family3; default: return device_family::unknown; } } @@ -345,11 +346,11 @@ bool is_same_pci_addr(const zes_pci_address_t& addr1, const zes_pci_address_t& a if (!(addr1.domain == addr2.domain && addr1.bus == addr2.bus && addr1.device == addr2.device && addr1.function == addr2.function)) { result = false; - LOG_DEBUG("pci addresses are not the same:" - " addr1: ", - ccl::ze::to_string(addr1), - " addr2: ", - ccl::ze::to_string(addr2)); + //LOG_DEBUG("pci addresses are not the same:" + // " addr1: ", + // ccl::ze::to_string(addr1), + // " addr2: ", + // ccl::ze::to_string(addr2)); } return result; } @@ -427,6 +428,43 @@ bool fabric_port_comparator::operator()(const zes_fabric_port_id_t& a, } } +std::string to_string(ze_event_scope_flag_t scope_flag) { + switch (scope_flag) { + case ZE_EVENT_SCOPE_FLAG_SUBDEVICE: return "ZE_EVENT_SCOPE_FLAG_SUBDEVICE"; + case ZE_EVENT_SCOPE_FLAG_DEVICE: return "ZE_EVENT_SCOPE_FLAG_DEVICE"; + case ZE_EVENT_SCOPE_FLAG_HOST: return "ZE_EVENT_SCOPE_FLAG_HOST"; + default: + return "unknown ze_event_scope_flag_t value: " + + std::to_string(static_cast<int>(scope_flag)); + } +} + +std::string to_string(ze_event_scope_flags_t _scope_flags) { + auto scope_flags = _scope_flags; + std::string out; + while (scope_flags) { + if (out.size()) + out += "|"; + if (scope_flags & ZE_EVENT_SCOPE_FLAG_SUBDEVICE) { + out += to_string(ZE_EVENT_SCOPE_FLAG_SUBDEVICE); + scope_flags &=
~ZE_EVENT_SCOPE_FLAG_SUBDEVICE; + } + else if (scope_flags & ZE_EVENT_SCOPE_FLAG_DEVICE) { + out += to_string(ZE_EVENT_SCOPE_FLAG_DEVICE); + scope_flags &= ~ZE_EVENT_SCOPE_FLAG_DEVICE; + } + else if (scope_flags & ZE_EVENT_SCOPE_FLAG_HOST) { + out += to_string(ZE_EVENT_SCOPE_FLAG_HOST); + scope_flags &= ~ZE_EVENT_SCOPE_FLAG_HOST; + } + else { + return "unknown ze_event_scope_flag_t value: " + + std::to_string(static_cast(_scope_flags)); + } + } + return out; +} + std::string to_string(ze_result_t result) { switch (result) { case ZE_RESULT_SUCCESS: return "ZE_RESULT_SUCCESS"; diff --git a/src/sched/entry/ze/ze_primitives.hpp b/src/sched/entry/ze/ze_primitives.hpp index 7290fddfa..60d7de9f7 100644 --- a/src/sched/entry/ze/ze_primitives.hpp +++ b/src/sched/entry/ze/ze_primitives.hpp @@ -29,7 +29,7 @@ namespace ze { #define ZE_CALL(ze_name, ze_args) ccl::ze::ze_call().do_call(ze_name ze_args, #ze_name) -enum class device_id : uint32_t { unknown = 0x0, id1 = 0x200, id2 = 0xbd0 }; +enum class device_id : uint32_t { unknown = 0x0, id1 = 0x200, id2 = 0xbd0, id3 = 0xb60 }; enum class copy_engine_mode { none, main, link, auto_mode }; enum class h2d_copy_engine_mode { none, main, auto_mode }; @@ -77,12 +77,33 @@ constexpr ze_device_mem_alloc_desc_t default_device_mem_alloc_desc = { constexpr ze_memory_allocation_properties_t default_alloc_props = { .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, .pNext = nullptr, - .type = ZE_MEMORY_TYPE_UNKNOWN + .type = ZE_MEMORY_TYPE_UNKNOWN, + .id = 0, + .pageSize = 0 }; constexpr ze_device_properties_t default_device_props = { .stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES, - .pNext = nullptr }; + .pNext = nullptr, + .type = ZE_DEVICE_TYPE_GPU, + .vendorId = 0, + .deviceId = 0, + .flags = 0, + .subdeviceId = 0, + .coreClockRate = 0, + .maxMemAllocSize = 0, + .maxHardwareContexts = 0, + .maxCommandQueuePriority = 0, + .numThreadsPerEU = 0, + .physicalEUSimdWidth = 0, + .numEUsPerSubslice = 0, + .numSubslicesPerSlice = 0, + .numSlices = 0, + .timerResolution = 0, + .timestampValidBits = 0, + .kernelTimestampValidBits = 0, + .uuid = {}, + .name = {} }; constexpr ze_event_pool_desc_t default_event_pool_desc = { .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, @@ -193,6 +214,8 @@ struct fabric_port_comparator { bool operator()(const zes_fabric_port_id_t& a, const zes_fabric_port_id_t& b) const; }; +std::string to_string(ze_event_scope_flag_t scope_flag); +std::string to_string(ze_event_scope_flags_t scope_flags); std::string to_string(ze_result_t result); std::string to_string(const ze_group_size_t& group_size); std::string to_string(const ze_group_count_t& group_count); diff --git a/src/sched/entry/ze/ze_pt2pt_barrier_entry.cpp b/src/sched/entry/ze/ze_pt2pt_barrier_entry.cpp new file mode 100644 index 000000000..839543552 --- /dev/null +++ b/src/sched/entry/ze/ze_pt2pt_barrier_entry.cpp @@ -0,0 +1,69 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#include "sched/entry/coll/direct/base_coll_entry.hpp" +#include "sched/entry/ze/ze_pt2pt_barrier_entry.hpp" +#include "sched/entry/ze/ze_primitives.hpp" + +#include +#include + +using namespace ccl; +using namespace ccl::ze; + +ze_pt2pt_barrier_entry::ze_pt2pt_barrier_entry(ccl_sched* sched, ccl_comm* comm, int peer_rank) + : sched_entry(sched), + comm(comm), + peer_rank(peer_rank) {} + +void ze_pt2pt_barrier_entry::start() { + LOG_DEBUG("ze_pt2pt_barrier_entry is strated"); + status = ccl_sched_entry_status_started; +} + +void ze_pt2pt_barrier_entry::update() { + ccl_sched_id_t pt2pt_ack_first, pt2pt_ack_second; + std::tie(pt2pt_ack_first, pt2pt_ack_second) = + comm->get_atl_comm()->tag_creator->get_pt2pt_sync_tags(); + + if (sched->coll_param.ctype == ccl_coll_recv) { + uint64_t ack_tag_first = comm->get_atl_comm()->tag_creator->create( + comm->rank(), comm->get_comm_id(), pt2pt_ack_first, sched->get_op_id()); + uint64_t ack_tag_second = comm->get_atl_comm()->tag_creator->create( + peer_rank, comm->get_comm_id(), pt2pt_ack_second, sched->get_op_id()); + + ccl::utils::send_ack_to_peer(comm->get_atl_comm(), ack_tag_first, peer_rank); + ccl::utils::recv_ack_from_peer(comm->get_atl_comm(), ack_tag_second, peer_rank); + LOG_DEBUG("recv side: first_tag: ", ack_tag_first, ", second_tag: ", ack_tag_second); + } + if (sched->coll_param.ctype == ccl_coll_send) { + uint64_t ack_tag_first = comm->get_atl_comm()->tag_creator->create( + peer_rank, comm->get_comm_id(), pt2pt_ack_first, sched->get_op_id()); + uint64_t ack_tag_second = comm->get_atl_comm()->tag_creator->create( + comm->rank(), comm->get_comm_id(), pt2pt_ack_second, sched->get_op_id()); + + ccl::utils::recv_ack_from_peer(comm->get_atl_comm(), ack_tag_first, peer_rank); + ccl::utils::send_ack_to_peer(comm->get_atl_comm(), ack_tag_second, peer_rank); + LOG_DEBUG("send side: first_tag: ", ack_tag_first, ", second_tag: ", ack_tag_second); + } + LOG_DEBUG("ze_pt2pt_barrier_entry is complete"); + status = ccl_sched_entry_status_complete; +} + +std::string ze_pt2pt_barrier_entry::name_ext() const { + std::stringstream out; + out << name() << "rank:" << comm->rank() << ", peer_rank: " << peer_rank; + return out.str(); +} diff --git a/src/sched/entry/ze/ze_pt2pt_barrier_entry.hpp b/src/sched/entry/ze/ze_pt2pt_barrier_entry.hpp new file mode 100644 index 000000000..581a15398 --- /dev/null +++ b/src/sched/entry/ze/ze_pt2pt_barrier_entry.hpp @@ -0,0 +1,41 @@ +/* + Copyright 2016-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include "sched/entry/entry.hpp" +#include "sched/entry/ze/ze_base_entry.hpp" + +class ze_pt2pt_barrier_entry : public sched_entry { +public: + static constexpr const char* class_name() noexcept { + return "ZE_PT2PT_BARRIER"; + } + + const char* name() const override { + return class_name(); + } + + virtual std::string name_ext() const override; + + explicit ze_pt2pt_barrier_entry(ccl_sched* sched, ccl_comm* comm, int peer_rank); + + void start() override; + void update() override; + +private: + ccl_comm* comm; + int peer_rank; +}; diff --git a/src/sched/entry/ze/ze_reduce_local_entry.cpp b/src/sched/entry/ze/ze_reduce_local_entry.cpp index b5b6692a0..658be1f8d 100644 --- a/src/sched/entry/ze/ze_reduce_local_entry.cpp +++ b/src/sched/entry/ze/ze_reduce_local_entry.cpp @@ -15,7 +15,7 @@ */ #include "sched/entry/ze/ze_reduce_local_entry.hpp" #include "sched/entry/ze/ze_primitives.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include diff --git a/src/sched/sched.cpp b/src/sched/sched.cpp index e087b143e..a0498493e 100644 --- a/src/sched/sched.cpp +++ b/src/sched/sched.cpp @@ -32,7 +32,7 @@ #include "common/utils/sycl_utils.hpp" #ifdef CCL_ENABLE_ZE -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/entry/ze/ze_primitives.hpp" #endif // CCL_ENABLE_ZE @@ -131,11 +131,35 @@ void ccl_sched::commit(ccl_parallelizer* parallelizer, bool update_sched_id) { } #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) -void ccl_sched::ze_commands_submit() { +uint32_t ccl_sched::ze_commands_submit() { + uint32_t cmd_counter = 0; for (auto& entry : entries) { - entry->ze_commands_submit(); + cmd_counter += entry->ze_commands_submit(); } + return cmd_counter; }; + +bool ccl_sched::get_ze_commands_bypass_flag() { + return is_ze_commands_bypass; +} + +void ccl_sched::set_ze_commands_bypass_flag(bool bypass) { + if (subsched_entry_parent_sched) { + subsched_entry_parent_sched->set_ze_commands_bypass_flag(bypass); + } + if (parent_sched) { + parent_sched->set_ze_commands_bypass_flag(bypass); + } + is_ze_commands_bypass = bypass; +} + +std::shared_ptr<sync_object>& ccl_sched::get_init_ze_hook_sync_obj() { + return init_ze_hook_sync_obj; +} + +void ccl_sched::set_init_ze_hook_sync_obj(std::shared_ptr<sync_object> sync_obj) { + init_ze_hook_sync_obj = std::move(sync_obj); +} #endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL void ccl_sched::reset_state() { @@ -190,8 +214,9 @@ ccl_request* ccl_sched::reset_request() { void ccl_sched::add_subsched(const ccl_coll_param& coll_param, bool update_sched_id) { ccl_sched_id_t param_sched_id = - update_sched_id ? coll_param.comm->get_sched_id(sched_type != ccl_sched_regular) - : this->sched_id; + update_sched_id + ? coll_param.comm->get_sched_id(sched_type != ccl_sched_regular, coll_param.is_pt2pt) + : this->sched_id; ccl_sched_create_param param = { sched_type, param_sched_id, coll_param }; @@ -292,11 +317,25 @@ ccl_sched::ccl_sched_ptr ccl_sched::create(const ccl_coll_param& param, const cc ccl_sched_key key; ccl_sched_ptr sched; bool is_created = false; - auto create_fn = [param]() -> ccl_sched_ptr { - return new ccl_sched({ ccl_sched_regular, param.comm->get_sched_id(false), param }, - /* top-level sched */ true); + // WARNING: be cautious when modifying the lambda-related code!
+ // `param` is captured by reference + // lifetimes: + // - the lambda's lifetime ends at the end of `ccl_sched::create` + // - `param`'s lifetime exceeds the end of `ccl_sched::create` + // `param` outlives the lambda, so the code is ok (there should be no memory issues) + // C++ does not check lifetimes; this has to be ensured by the programmer + // (a capture-lifetime sketch follows after the sched.cpp hunks below) + // optionally, the code might be refactored in the future to use shared_ptr + auto create_fn = [&param]() -> ccl_sched_ptr { + return new ccl_sched( + { ccl_sched_regular, param.comm->get_sched_id(false, param.is_pt2pt), param }, + /* top-level sched */ true); }; +#ifdef CCL_ENABLE_ITT + __itt_event sched_create_event = ccl::profile::itt::event_get("SCHED_CREATE"); + ccl::profile::itt::event_start(sched_create_event); +#endif // CCL_ENABLE_ITT + if (attr.to_cache) { key.set(param, attr); std::tie(sched, is_created) = @@ -323,6 +362,10 @@ ccl_sched::ccl_sched_ptr ccl_sched::create(const ccl_coll_param& param, const cc "found sched, reuse ", sched, ", type ", ccl_coll_type_to_str(sched->coll_param.ctype)); } +#ifdef CCL_ENABLE_ITT + ccl::profile::itt::event_end(sched_create_event); +#endif // CCL_ENABLE_ITT + return sched; } @@ -427,7 +470,6 @@ void ccl_sched::complete() { ccl_coll_param* profile_param = &(coll_param); ss << ccl_coll_type_to_str(profile_param->ctype); - /* TODO: tmp check, replace ccl_coll_entry_param by ccl_coll_param */ if (!profile_param->send_counts.empty()) { ss << " count:" << profile_param->get_send_count(); } @@ -494,6 +536,12 @@ void ccl_sched::renew(bool need_update_id, bool reset) { if (reset) reset_state(); + +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + if (init_ze_hook_sync_obj && ze_entries.empty()) { + init_ze_hook_sync_obj->visit(); + } +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE } void ccl_sched::add_barrier() { @@ -583,27 +631,6 @@ void ccl_sched::try_to_restart() { /* restart */ true); } -void ccl_sched::release_sync_event(ccl_request* request) { -#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) - if (use_output_event) { - // check if the event has been reset already(is_host is true for an empty one) - if (!request->has_sync_event()) { - LOG_DEBUG("request's event has been released already, skipping"); - } - else { - auto& pools = ccl::global_data::get().ze_data->dynamic_event_pools; - auto pool_it = pools.find(coll_param.stream->get_ze_context()); - CCL_THROW_IF_NOT(pool_it != pools.end(), "pool must be initialized for the context"); - - pool_it->second.put_event(ccl::utils::get_native_event(request->get_sync_event())); - } - } - else { - LOG_DEBUG("skip sync event destruction"); - } -#endif -} - void ccl_sched::update_active_request(bool use_delayed) { // at this point we reset the active request, but it still can // be referenced via an event, returned previously to the user.
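// A minimal capture-lifetime sketch of the rule the ccl_sched::create comment
// above relies on (hedged illustration; make_and_use and its int parameter are
// hypothetical stand-ins, not oneCCL API): a [&x] lambda merely borrows x, so
// calling it is safe only while x is alive.
int make_and_use(const int& param) {
    auto fn = [&param]() { return param + 1; }; // fn borrows param by reference
    return fn(); // ok: param, owned by the caller, outlives fn
} // fn is destroyed here, while param is still alive;
  // returning fn itself would hand the caller a dangling reference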
@@ -614,15 +641,5 @@ void ccl_sched::update_active_request(bool use_delayed) { } void ccl_sched::complete_itt(const ccl_stream* stream) { -#ifdef CCL_ENABLE_ITT -#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) - // only applicable for device execution - if (stream) { - ccl::profile::itt::task_end(ccl::profile::itt::task_type::completion); - } -#endif // CCL_ENABLE_SYCL - ccl::profile::itt::task_end(ccl::profile::itt::task_type::operation); -#else (void)stream; -#endif // CCL_ENABLE_ITT } diff --git a/src/sched/sched.hpp b/src/sched/sched.hpp index 50c67dc25..f8ff32050 100644 --- a/src/sched/sched.hpp +++ b/src/sched/sched.hpp @@ -16,6 +16,7 @@ #pragma once #include "common/request/request.hpp" +#include "common/utils/sync_object.hpp" #include "sched/sched_base.hpp" #include "sched/sched_timer.hpp" #include "sched/queue/flow_control.hpp" @@ -81,6 +82,16 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { // if true - we're restarting the same sched after it's been delayed bool restart = false); +#if defined(CCL_ENABLE_ZE) && defined(CCL_ENABLE_SYCL) + void inherit_ze_managers_from(ccl_sched* sched) { + CCL_THROW_IF_NOT(entries.empty()); + CCL_THROW_IF_NOT(subscheds.empty()); + CCL_THROW_IF_NOT(sched); + + memory.list_manager = sched->memory.list_manager; + } +#endif // CCL_ENABLE_ZE && CCL_ENABLE_SYCL + /** * Reset completion counter of @b req * @return pointer to req that can be used to track completion @@ -141,7 +152,6 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { using ccl_sched_base::add_entry_front_t; using ccl_sched_base::add_entry_back_t; - using add_entry_default_t = add_entry_mode_t; sched_entry* add_entry(std::unique_ptr&& entry) { entry->set_exec_mode(exec_mode); @@ -247,14 +257,18 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { // cleanup structs related to the request from // the schedule bool release_request(ccl_request* req); - // release event associated with the request, this - // needs to be here because the pool is stored per - // schedule - void release_sync_event(ccl_request* req); void set_submitted_to_gpu(bool submitted_to_gpu); bool is_submitted_to_gpu(); +#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + bool get_ze_commands_bypass_flag(); + void set_ze_commands_bypass_flag(bool bypass); + + std::shared_ptr& get_init_ze_hook_sync_obj(); + void set_init_ze_hook_sync_obj(std::shared_ptr sync_obj); +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + private: void set_output_event(ccl_request* request); void update_active_request(bool use_delayed); @@ -266,10 +280,12 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { ccl_request* req; #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) - void ze_commands_submit(); + uint32_t ze_commands_submit(); + bool is_ze_commands_bypass{ true }; + std::shared_ptr init_ze_hook_sync_obj; const bool use_output_event = false; -#endif +#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE const bool top_level_sched; // pointer to the parent sched if this sched is part of a subsched_entry, nullptr otherwise @@ -279,6 +295,7 @@ class alignas(CACHELINE_SIZE) ccl_sched : public ccl_sched_base { std::unique_ptr restart_manager; friend class sched_restart_manager; - friend class execute_cmdlists_entry; // need to call ze_commands_submit(); + friend class ze_execute_cmdlists_on_init_entry; // need to call ze_commands_submit(); + friend class ze_execute_cmdlists_on_start_entry; // need to call ze_commands_submit(); friend class subsched_entry; // need to call 
ze_commands_submit(); }; diff --git a/src/sched/sched_base.cpp b/src/sched/sched_base.cpp index 1874e9c6c..61360cb4f 100644 --- a/src/sched/sched_base.cpp +++ b/src/sched/sched_base.cpp @@ -15,6 +15,7 @@ */ #include +#include "atl/mpi/atl_mpi_ctx.hpp" #include "coll/algorithms/algorithm_utils.hpp" #include "coll/coll_param.hpp" #include "coll/selection/selection.hpp" @@ -36,8 +37,10 @@ ccl_sched_base::ccl_sched_base(const ccl_sched_create_param& param) if (coll_param.stream && coll_param.stream->get_backend() == ccl::utils::get_level_zero_backend()) { memory.event_manager.reset(new ccl::ze::event_manager(coll_param.stream)); + auto node_comm = coll_param.comm->get_node_comm().get(); memory.handle_manager.init(node_comm, coll_param.stream); + memory.ipc_event_pool_manager.init(coll_param.stream); memory.list_manager.reset(new ccl::ze::list_manager(this, coll_param.stream)); } @@ -45,12 +48,13 @@ ccl_sched_base::ccl_sched_base(const ccl_sched_create_param& param) } std::string to_string(ccl_sched_add_mode mode) { + auto mode_str = "UNDEFINED"; switch (mode) { case ccl_sched_add_front: return "FRONT"; case ccl_sched_add_back: return "BACK"; - default: return "DEFAULT"; + default: mode_str = "DEFAULT"; } - return "DEFAULT"; + return mode_str; } ccl_sched_base::~ccl_sched_base() { @@ -69,16 +73,15 @@ void ccl_sched_base::update_coll_param_and_attr(const ccl_coll_param& param, #endif // CCL_ENABLE_SYCL bool has_pre_post_copies = - (!coll_param.device_send_bufs.empty() || !coll_param.device_recv_bufs.empty()) ? true - : false; + (!coll_param.send_dev_bufs.empty() || !coll_param.recv_dev_bufs.empty()) ? true : false; if (has_pre_post_copies) { - CCL_THROW_IF_NOT(coll_param.device_send_bufs.size() == param.send_bufs.size(), + CCL_THROW_IF_NOT(coll_param.send_dev_bufs.size() == param.send_bufs.size(), "send_bufs sizes mismatch"); - CCL_THROW_IF_NOT(coll_param.device_recv_bufs.size() == param.recv_bufs.size(), + CCL_THROW_IF_NOT(coll_param.recv_dev_bufs.size() == param.recv_bufs.size(), "recv_bufs sizes mismatch"); - coll_param.device_send_bufs = param.send_bufs; - coll_param.device_recv_bufs = param.recv_bufs; + coll_param.send_dev_bufs = param.send_bufs; + coll_param.recv_dev_bufs = param.recv_bufs; } else { CCL_THROW_IF_NOT(coll_param.send_bufs.size() == param.send_bufs.size(), @@ -170,13 +173,12 @@ void ccl_sched_base::dealloc_buffer(const ccl::dealloc_param& user_param) { } #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) -bool ccl_sched_base::try_enable_ze_single_list() { +void ccl_sched_base::try_enable_ze_single_list() { CCL_THROW_IF_NOT(ze_entries.empty(), "trying to modify the list mode after ze_entries has already been formed"); use_single_list = ccl::global_data::env().enable_ze_single_list && ccl::global_data::env().kernel_debug == 0 && !ccl::global_data::env().enable_fusion; - return use_single_list; } void ccl_sched_base::append_to_ze_entries_list(sched_entry* entry) { @@ -185,6 +187,26 @@ void ccl_sched_base::append_to_ze_entries_list(sched_entry* entry) { } ze_entries.push_back(entry); } + +bool ccl_sched_base::check_pt2pt_pre_post_copy_support(const ccl_coll_param& param, + bool enable_pt2pt_offload) { + if ((param.ctype == ccl_coll_send || param.ctype == ccl_coll_recv) && + (param.stream && param.stream->is_sycl_device_stream())) { + bool enable_hmem = + (ccl::global_data::env().use_hmem && atl_base_comm::attr.out.enable_hmem); + LOG_DEBUG("value of hmem is: ", enable_hmem); + + if (enable_hmem) { + LOG_DEBUG("hmem is enabled, no need for pre/post copy"); + return false; + } 
+ else if (enable_pt2pt_offload) { + LOG_DEBUG("offload algo is selected for send-recv, no need for pre/post copy"); + return false; + } + } + return true; +} #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE void ccl_sched_base::sched_complete_hook() { @@ -243,7 +265,13 @@ void ccl_sched_base::clear_memory() { } memory.handle_manager.clear(); memory.ipc_event_pool_manager.clear(); - if (memory.list_manager) { + + // Since list_manager is a shared_ptr, call clear only for the last + // reference (when use_count() is 1). + // In all other cases, it is correct to simply skip calling clear since + // other schedules may have inherited the same list_manager entry, and, + // therefore, the same lists may still be in use. + if (memory.list_manager.use_count() == 1) { memory.list_manager->clear(); } } @@ -317,8 +345,9 @@ void ccl_sched_base::get_pre_post_copy_counts(std::vector& d2h_counts, h2d_counts.end(), param.recv_counts.begin(), param.recv_counts.end()); } else { - h2d_counts.push_back( - std::accumulate(param.recv_counts.begin(), param.recv_counts.end(), 0)); + h2d_counts.push_back(std::accumulate(param.recv_counts.begin(), + param.recv_counts.end(), + ccl::utils::initial_count_value)); } break; case ccl_coll_allreduce: @@ -340,10 +369,12 @@ void ccl_sched_base::get_pre_post_copy_counts(std::vector& d2h_counts, h2d_counts.end(), param.recv_counts.begin(), param.recv_counts.end()); } else { - d2h_counts.push_back( - std::accumulate(param.send_counts.begin(), param.send_counts.end(), 0)); - h2d_counts.push_back( - std::accumulate(param.recv_counts.begin(), param.recv_counts.end(), 0)); + d2h_counts.push_back(std::accumulate(param.send_counts.begin(), + param.send_counts.end(), + ccl::utils::initial_count_value)); + h2d_counts.push_back(std::accumulate(param.recv_counts.begin(), + param.recv_counts.end(), + ccl::utils::initial_count_value)); } break; case ccl_coll_bcast: @@ -372,11 +403,14 @@ void ccl_sched_base::get_pre_post_copy_counts(std::vector& d2h_counts, void ccl_sched_base::alloc_buffers_for_pre_post_copy() { #ifdef CCL_ENABLE_SYCL + if (coll_param.ctype == ccl_coll_last_value) { + return; + } ccl_coll_param& param = coll_param; - param.device_send_bufs.clear(); - param.device_recv_bufs.clear(); + param.send_dev_bufs.clear(); + param.recv_dev_bufs.clear(); ccl_selector_param selector_param; selector_param.ctype = param.ctype; @@ -385,22 +419,17 @@ void ccl_sched_base::alloc_buffers_for_pre_post_copy() { selector_param.comm = param.comm; selector_param.stream = param.stream; selector_param.is_sycl_buf = coll_attr.is_sycl_buf; + selector_param.peer_rank = param.peer_rank; selector_param.recv_counts = param.recv_counts.data(); - bool enable_hmem = (ccl::global_data::env().use_hmem && atl_base_comm::attr.out.enable_hmem); - LOG_DEBUG("value of hmem is = ", enable_hmem); - if (!param.stream || !param.stream->is_sycl_device_stream() || ccl_is_device_side_algo(selector_param)) { return; } - if ((param.ctype == ccl_coll_send || param.ctype == ccl_coll_recv) && - (param.stream || param.stream->is_sycl_device_stream())) { - if (enable_hmem) { - LOG_DEBUG("hmem is enabled, no need for pre/post copy"); - return; - } + if (!check_pt2pt_pre_post_copy_support( + param, (ccl_is_offload_pt2pt_algo(selector_param) && use_pt2pt_offload_algo()))) { + return; } bool should_alloc_buffers = true; @@ -428,8 +457,8 @@ void ccl_sched_base::alloc_buffers_for_pre_post_copy() { move user-supplied pointers into device_* fields they will be used further for pre-post copies */ - param.device_send_bufs = param.send_bufs; - 
param.device_recv_bufs = param.recv_bufs; + param.send_dev_bufs = param.send_bufs; + param.recv_dev_bufs = param.recv_bufs; std::vector d2h_counts; std::vector h2d_counts; @@ -484,23 +513,23 @@ void ccl_sched_base::alloc_buffers_for_pre_post_copy() { param.recv_bufs = param.send_bufs; } - CCL_THROW_IF_NOT(param.send_bufs.size() == param.device_send_bufs.size(), + CCL_THROW_IF_NOT(param.send_bufs.size() == param.send_dev_bufs.size(), "send_bufs.size() mismatch: ", param.send_bufs.size(), " vs ", - param.device_send_bufs.size()); + param.send_dev_bufs.size()); - CCL_THROW_IF_NOT(param.recv_bufs.size() == param.device_recv_bufs.size(), + CCL_THROW_IF_NOT(param.recv_bufs.size() == param.recv_dev_bufs.size(), "recv_bufs.size() mismatch: ", param.recv_bufs.size(), " vs ", - param.device_recv_bufs.size()); + param.recv_dev_bufs.size()); #endif // CCL_ENABLE_SYCL } void ccl_sched_base::update_id() { - sched_id = coll_param.comm->get_sched_id(sched_type != ccl_sched_regular); + sched_id = coll_param.comm->get_sched_id(sched_type != ccl_sched_regular, coll_param.is_pt2pt); } void ccl_sched_base::dump(std::ostream& out, const char* name) const { diff --git a/src/sched/sched_base.hpp b/src/sched/sched_base.hpp index 56bff73f1..4201273c8 100644 --- a/src/sched/sched_base.hpp +++ b/src/sched/sched_base.hpp @@ -58,7 +58,7 @@ struct ccl_sched_memory { std::unique_ptr event_manager; ccl::ze::ipc_handle_manager handle_manager; ccl::ze::ipc_event_pool_manager ipc_event_pool_manager; - std::unique_ptr list_manager; + std::shared_ptr list_manager; #endif // CCL_ENABLE_ZE std::list mr_list; @@ -69,22 +69,22 @@ struct ccl_sched_create_param { ccl_sched_id_t id; ccl_coll_param coll_param; - ccl_sched_create_param(ccl_sched_type type, ccl_sched_id_t id, ccl_coll_param coll_param) + ccl_sched_create_param(ccl_sched_type type, ccl_sched_id_t id, const ccl_coll_param& coll_param) : type(type), id(id), coll_param(coll_param) {} - ccl_sched_create_param(ccl_sched_type type, ccl_coll_param coll_param) + ccl_sched_create_param(ccl_sched_type type, const ccl_coll_param& coll_param) : ccl_sched_create_param(type, 0, coll_param) {} - ccl_sched_create_param(ccl_sched_id_t id, ccl_coll_param coll_param) + ccl_sched_create_param(ccl_sched_id_t id, const ccl_coll_param& coll_param) : ccl_sched_create_param(ccl_sched_regular, id, coll_param) {} }; static size_t lifo_priority = 0; struct ccl_sched_base { - template + template using add_entry_mode_t = std::integral_constant; using add_entry_front_t = add_entry_mode_t; @@ -137,8 +137,9 @@ struct ccl_sched_base { std::vector ze_entries; bool use_single_list{}; - bool try_enable_ze_single_list(); + void try_enable_ze_single_list(); void append_to_ze_entries_list(sched_entry* entry); + bool check_pt2pt_pre_post_copy_support(const ccl_coll_param& param, bool enable_pt2pt_offload); #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE ccl_sched_type sched_type = ccl_sched_regular; diff --git a/src/sched/sched_timer.cpp b/src/sched/sched_timer.cpp index 8043bc45f..cff2d8af6 100644 --- a/src/sched/sched_timer.cpp +++ b/src/sched/sched_timer.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "common/global/global.hpp" #include "common/log/log.hpp" @@ -23,6 +24,8 @@ #ifdef CCL_ENABLE_ITT #include "ittnotify.h" +#include +#include #endif // CCL_ENABLE_ITT namespace ccl { @@ -65,78 +68,95 @@ std::string to_string(const sched_timer& timer) { namespace profile { namespace itt { -static __itt_domain* get_domain() { - static __itt_domain* domain = __itt_domain_create("oneCCL"); - return 
domain; -} +static constexpr unsigned max_entry_name_length = 64; +// Map of vectors of events that allows us to avoid multiple +// expensive calls to `__itt_event_create`. +thread_local std::unordered_map<std::string, std::vector<__itt_event>> event_cache; +// Inflight events are events fetched from cache that were not returned yet. +// This structure allows us to easily return a finished event to the cache vector +// it belongs to. +thread_local std::unordered_map<__itt_event, std::vector<__itt_event>*> inflight_event_cache; +thread_local std::unordered_map<__itt_event, unsigned> inflight_event_ref_counts; -static __itt_string_handle* get_operation_handle() { - static __itt_string_handle* handle = __itt_string_handle_create("ccl_operation"); - return handle; +void set_thread_name(const std::string& name) { + __itt_thread_set_name(name.c_str()); } -static __itt_string_handle* get_api_call_handle() { - static __itt_string_handle* handle = __itt_string_handle_create("ccl_api_call"); - return handle; -} +__itt_event event_get(const char* name) { + if (ccl::global_data::env().itt_level == 0) { + return invalid_event; + } -#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) + __itt_event event; -static __itt_string_handle* get_preparation_handle() { - static __itt_string_handle* handle = __itt_string_handle_create("ccl_init"); - return handle; -} + auto cache_entry = event_cache.find(name); -static __itt_string_handle* get_device_work_handle() { - static __itt_string_handle* handle = __itt_string_handle_create("ccl_submit_and_execute"); - return handle; -} + if (cache_entry == event_cache.end()) { + // Initialize vector of __itt_events + event_cache[name]; + cache_entry = event_cache.find(name); + } -static __itt_string_handle* get_deps_handling_handle() { - static __itt_string_handle* handle = __itt_string_handle_create("ccl_deps_handling"); - return handle; -} + // Entry in event_cache is initialized, we + // can fetch the vector for the specific event type + auto cached_vector = &cache_entry->second; -static __itt_string_handle* get_completion_handle() { - static __itt_string_handle* handle = __itt_string_handle_create("ccl_finalize"); - return handle; -} + if (!cached_vector->empty()) { + // There is a cached __itt_event handle + // that can be reused + event = cached_vector->back(); + cached_vector->pop_back(); + } + else { + // No cached events + char prefix_name[max_entry_name_length] = "oneCCL::"; + strncat(prefix_name, name, max_entry_name_length - strlen(prefix_name) - 1); + event = __itt_event_create(prefix_name, strlen(prefix_name)); + } -#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE + // Record cache vector to which the event should be + // returned on event_end + inflight_event_cache[event] = cached_vector; -void set_thread_name(const std::string& name) { - __itt_thread_set_name(name.c_str()); -} + auto event_ref_count = inflight_event_ref_counts.find(event); -static __itt_string_handle* get_handle_for_type(task_type type) { - switch (type) { - case task_type::operation: return get_operation_handle(); - case task_type::api_call: return get_api_call_handle(); -#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) - case task_type::preparation: return get_preparation_handle(); - case task_type::device_work: return get_device_work_handle(); - case task_type::deps_handling: return get_deps_handling_handle(); - case task_type::completion: return get_completion_handle(); -#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE - default: CCL_THROW("invalid task type: ", (int)type); + if (event_ref_count == inflight_event_ref_counts.end()) {
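// (A hedged aside on the bookkeeping this branch participates in: event_get()
// hands out a handle and counts it as inflight, while event_end() decrements
// the count, returns the handle to its cache vector, and erases the bookkeeping
// entries once the count reaches zero. A typical round-trip, using the names
// from this file:
//     __itt_event e = event_get("SCHED_CREATE"); // inflight ref count -> 1
//     event_start(e);
//     /* ...work being profiled... */
//     event_end(e);                              // count -> 0, entries erased
// "SCHED_CREATE" is the label this patch registers in ccl_sched::create.)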
// The event handle is not in use by any entry + inflight_event_ref_counts[event] = 1; + } + else { + event_ref_count->second++; } + + return event; } -void task_start(task_type type) { - if (ccl::global_data::env().itt_level == 0) +void event_start(__itt_event event) { + if (ccl::global_data::env().itt_level == 0) { return; + } - auto* handle = get_handle_for_type(type); - __itt_task_begin(get_domain(), __itt_null, __itt_null, handle); + __itt_event_start(event); } -void task_end(task_type type) { - if (ccl::global_data::env().itt_level == 0) +void event_end(__itt_event event) { + if (ccl::global_data::env().itt_level == 0) { return; + } + + __itt_event_end(event); + inflight_event_cache[event]->push_back(event); - // ignore for now, use only to identify the task that's being ended - (void)type; - __itt_task_end(get_domain()); + auto event_ref_count = inflight_event_ref_counts.find(event); + + event_ref_count->second--; + if (event_ref_count->second == 0) { + // No more references to the event are currently used + // which means that we can remove the event from + // 'inflight' cache. + inflight_event_cache.erase(event); + inflight_event_ref_counts.erase(event); + } } } // namespace itt diff --git a/src/sched/sched_timer.hpp b/src/sched/sched_timer.hpp index aae3c37ac..6c4a3ea7e 100644 --- a/src/sched/sched_timer.hpp +++ b/src/sched/sched_timer.hpp @@ -18,6 +18,11 @@ #include #include +#ifdef CCL_ENABLE_ITT +#include "ittnotify.h" +#include "coll/algorithms/algorithm_utils.hpp" +#endif // CCL_ENABLE_ITT + #if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) #include "common/api_wrapper/ze_api_wrapper.hpp" #endif @@ -49,19 +54,11 @@ namespace itt { void set_thread_name(const std::string& name); -enum class task_type : int { - operation = 0, - api_call = 1, -#if defined(CCL_ENABLE_SYCL) && defined(CCL_ENABLE_ZE) - preparation = 2, - device_work = 3, - deps_handling = 4, - completion = 5 -#endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE -}; +static constexpr __itt_event invalid_event = -1; -void task_start(task_type type); -void task_end(task_type type); +__itt_event event_get(const char* name); +void event_start(__itt_event event); +void event_end(__itt_event event); } // namespace itt } // namespace profile diff --git a/src/sched/ze/ze_event_manager.cpp b/src/sched/ze/ze_event_manager.cpp index 1f28238dc..3bafe076c 100644 --- a/src/sched/ze/ze_event_manager.cpp +++ b/src/sched/ze/ze_event_manager.cpp @@ -18,7 +18,7 @@ #include "common/global/global.hpp" #include "common/log/log.hpp" #include "common/utils/sycl_utils.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/ze/ze_event_manager.hpp" using namespace ccl; @@ -174,11 +174,10 @@ dynamic_event_pool::dynamic_event_pool(const ccl_stream* stream) { dynamic_event_pool::~dynamic_event_pool() { // we expect that all events are released by the callee, at this point there // must be no allocated events, otherwise this indicates an error in the event handling - // TODO: return warnings - // if (!event_alloc_info.empty()) - // LOG_ERROR("all events are expected to be released"); - // if (!event_pools.empty()) - // LOG_ERROR("all event pools are expected to be released"); + if (!event_alloc_info.empty()) + LOG_ERROR("all events are expected to be released"); + if (!event_pools.empty()) + LOG_ERROR("all event pools are expected to be released"); } ze_event_pool_desc_t dynamic_event_pool::get_default_event_pool_desc() { @@ -209,7 +208,8 @@ ze_event_handle_t dynamic_event_pool::get_event() { 
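// A small standalone sketch of the rotating-slot scheme this hunk introduces
// (hedged; pick_slot, counter, and pool_size are illustrative stand-ins for
// the real event_pool_request_idx and event_pool_size members): each request
// takes the current slot and advances the counter modulo the pool size, so
// slots are reused round-robin instead of always picking index 0.
#include <cstddef>

inline std::size_t pick_slot(std::size_t& counter, std::size_t pool_size) {
    std::size_t idx = counter;           // slot handed to the current request
    counter = (counter + 1) % pool_size; // advance and wrap for the next request
    return idx;
}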
pool_info.num_alloc_events = 0; slot.pool = event_pools.insert(event_pools.end(), pool_info); - slot.pool_idx = 0; + slot.pool_idx = event_pool_request_idx; + event_pool_request_idx = (event_pool_request_idx + 1) % event_pool_size; return create_event(slot); } diff --git a/src/sched/ze/ze_event_manager.hpp b/src/sched/ze/ze_event_manager.hpp index 68d7352d6..b380dcb62 100644 --- a/src/sched/ze/ze_event_manager.hpp +++ b/src/sched/ze/ze_event_manager.hpp @@ -139,7 +139,8 @@ class dynamic_event_pool { // TODO: make some parameters configurable // TODO: check if another value would be better, as this one is chosen quite arbitrarily - static constexpr size_t event_pool_size{ 50 }; + static constexpr size_t event_pool_size{ 1000 }; + int event_pool_request_idx{ 0 }; static ze_event_pool_desc_t get_default_event_pool_desc(); static const ze_event_pool_desc_t common_pool_desc; diff --git a/src/sched/ze/ze_handle_manager.cpp b/src/sched/ze/ze_handle_manager.cpp index 8720b1292..df057a7bf 100644 --- a/src/sched/ze/ze_handle_manager.cpp +++ b/src/sched/ze/ze_handle_manager.cpp @@ -173,18 +173,32 @@ void ipc_handle_manager::clear() { cached_handles.clear(); } -void ipc_handle_manager::set(const mem_handle_map_t& handles_arg) { +void ipc_handle_manager::set(const mem_handle_map_t& handles_arg, bool pt2pt_op) { CCL_THROW_IF_NOT(!handles_arg.empty(), "handles_arg argument is empty"); - CCL_THROW_IF_NOT(handles_arg.size() == static_cast<size_t>(comm->size()), - "handles_arg and comm sizes should be equal"); + if (pt2pt_op) { + CCL_THROW_IF_NOT(handles_arg.size() == pt2pt_handles_size, + "handles_arg size (", + handles_arg.size(), + ") must be equal to pt2pt_handles_size (", + pt2pt_handles_size, + ")"); + } + else { + CCL_THROW_IF_NOT(handles_arg.size() == static_cast<size_t>(comm->size()), + "handles_arg and comm sizes should be equal"); + } + CCL_THROW_IF_NOT(handles.empty(), "handles should be empty before set"); handles = handles_arg; LOG_DEBUG("handles are set successfully, size of handles: ", handles.size()); } -void* ipc_handle_manager::get_ptr(int rank, size_t buf_idx, const ccl_comm* map_comm) { - check_rank(rank, (map_comm) ? map_comm : comm); +void* ipc_handle_manager::get_ptr(int rank, + size_t buf_idx, + const ccl_comm* map_comm, + bool pt2pt_op) { + check_rank(rank, (map_comm) ?
map_comm : comm, pt2pt_op); if (map_comm && (map_comm->id() != comm->id())) { int old_rank = rank; rank = map_comm->get_global_rank(rank); @@ -205,7 +219,7 @@ void* ipc_handle_manager::get_ptr(int rank, size_t buf_idx, const ccl_comm* map_ comm->id(), ", size: ", comm->size()); - check_rank(rank, comm); + check_rank(rank, comm, pt2pt_op); } CCL_THROW_IF_NOT(buf_idx < handles[rank].size(), "buf_idx is not a valid value: ", buf_idx); @@ -248,15 +262,20 @@ void* ipc_handle_manager::get_ptr(int rank, size_t buf_idx, const ccl_comm* map_ return static_cast<void*>(static_cast<char*>(mem_ptr) + handle_info.mem_offset); } -void ipc_handle_manager::get(int rank, size_t buf_idx, ccl_buffer& buf, const ccl_comm* map_comm) { - buf.set(get_ptr(rank, buf_idx, map_comm)); +void ipc_handle_manager::get(int rank, + size_t buf_idx, + ccl_buffer& buf, + const ccl_comm* map_comm, + bool pt2pt_op) { + buf.set(get_ptr(rank, buf_idx, map_comm, pt2pt_op)); } void ipc_handle_manager::get(int rank, size_t buf_idx, ze_event_pool_handle_t& buf, - const ccl_comm* map_comm) { - buf = (ze_event_pool_handle_t)get_ptr(rank, buf_idx, map_comm); + const ccl_comm* map_comm, + bool pt2pt_op) { + buf = (ze_event_pool_handle_t)get_ptr(rank, buf_idx, map_comm, pt2pt_op); } void ipc_handle_manager::get_handle(void* ptr, ze_ipc_mem_handle_t* ipc_handle) { @@ -311,17 +330,24 @@ void ipc_handle_manager::get_address_range(const void* ptr, void** base_ptr, siz *size); } -void ipc_handle_manager::check_rank(int rank, const ccl_comm* check_comm) { - CCL_THROW_IF_NOT( - (rank >= 0) && (rank < static_cast<int>(handles.size())) && (rank < check_comm->size()), - "invalid rank: ", - rank, - ", handles.size: ", - handles.size(), - ", comm.size: ", - check_comm->size()); - CCL_THROW_IF_NOT( - rank != check_comm->rank(), "do not expect to open ipc_handle for own rank: ", rank); +void ipc_handle_manager::check_rank(int rank, const ccl_comm* check_comm, bool pt2pt_op) { + if (pt2pt_op) { + CCL_THROW_IF_NOT((rank == 0) && (rank < static_cast<int>(handles.size())), + "pt2pt_op expects handle idx (rank) to be 0, got rank: ", + rank); + } + else { + CCL_THROW_IF_NOT( + (rank >= 0) && (rank < static_cast<int>(handles.size())) && (rank < check_comm->size()), + "invalid rank: ", + rank, + ", handles.size: ", + handles.size(), + ", comm.size: ", + check_comm->size()); + CCL_THROW_IF_NOT( + rank != check_comm->rank(), "do not expect to open ipc_handle for own rank: ", rank); + } } } // namespace ze diff --git a/src/sched/ze/ze_handle_manager.hpp b/src/sched/ze/ze_handle_manager.hpp index 4534cbedd..2136b965e 100644 --- a/src/sched/ze/ze_handle_manager.hpp +++ b/src/sched/ze/ze_handle_manager.hpp @@ -19,7 +19,7 @@ #include "common/stream/stream.hpp" #include "common/utils/buffer.hpp" #include "common/utils/utils.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/entry/ze/ze_primitives.hpp" #include @@ -68,6 +68,11 @@ struct ipc_handle_desc { class ipc_handle_manager { public: + // pt2pt ops are assumed to use only one buffer for communication, + // which means only one handle is expected for each rank that participates + // in the pt2pt communication + static constexpr int pt2pt_handles_size = 1; + // matrix with ipc handles, row - rank, column - buf_idx using mem_handle_map_t = typename std::vector<std::vector<ipc_handle_desc>>; @@ -79,11 +84,19 @@ class ipc_handle_manager { void init(const ccl_comm* comm, const ccl_stream* stream); void clear(); - void set(const mem_handle_map_t& handles_arg); + void set(const mem_handle_map_t& handles_arg, bool
pt2pt_op = false); - void* get_ptr(int rank, size_t buf_idx, const ccl_comm* map_comm); - void get(int rank, size_t buf_idx, ccl_buffer& buf, const ccl_comm* map_comm = nullptr); - void get(int rank, size_t buf_idx, ze_event_pool_handle_t& buf, const ccl_comm* map_comm); + void* get_ptr(int rank, size_t buf_idx, const ccl_comm* map_comm, bool pt2pt_op = false); + void get(int rank, + size_t buf_idx, + ccl_buffer& buf, + const ccl_comm* map_comm = nullptr, + bool pt2pt_op = false); + void get(int rank, + size_t buf_idx, + ze_event_pool_handle_t& buf, + const ccl_comm* map_comm, + bool pt2pt_op = false); void get_handle(void* ptr, ze_ipc_mem_handle_t* ipc_handle); void get_handle(ze_event_pool_handle_t pool, ze_ipc_event_pool_handle_t* ipc_handle); @@ -109,7 +122,7 @@ class ipc_handle_manager { */ std::list cached_handles; - void check_rank(int rank, const ccl_comm* check_comm); + void check_rank(int rank, const ccl_comm* check_comm, bool pt2pt_op); }; } // namespace ze diff --git a/src/sched/ze/ze_ipc_event_pool_manager.cpp b/src/sched/ze/ze_ipc_event_pool_manager.cpp index 15b2d1d60..3f97bfefb 100644 --- a/src/sched/ze/ze_ipc_event_pool_manager.cpp +++ b/src/sched/ze/ze_ipc_event_pool_manager.cpp @@ -15,7 +15,7 @@ */ #include "common/global/global.hpp" #include "common/log/log.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/ze/ze_ipc_event_pool_manager.hpp" using namespace ccl; diff --git a/src/sched/ze/ze_list_manager.cpp b/src/sched/ze/ze_list_manager.cpp index 6818dce53..a2e2f8bc3 100644 --- a/src/sched/ze/ze_list_manager.cpp +++ b/src/sched/ze/ze_list_manager.cpp @@ -14,7 +14,7 @@ limitations under the License. */ #include "common/global/global.hpp" -#include "sched/entry/ze/ze_cache.hpp" +#include "sched/entry/ze/cache/ze_cache.hpp" #include "sched/ze/ze_list_manager.hpp" using namespace ccl; @@ -28,10 +28,6 @@ ze_command_list_handle_t* list_info::get_native_ptr() { return &list; } -const ze_command_list_desc_t& list_info::get_desc() const { - return desc; -} - bool list_info::is_valid() const { return list != nullptr; } @@ -179,10 +175,6 @@ void queue_factory::clear() { queues.clear(); } -bool queue_factory::is_copy() const { - return is_copy_queue; -} - uint32_t queue_factory::get_ordinal() const { return queue_ordinal; } @@ -261,10 +253,6 @@ const char* list_factory::get_type_str() const { return (is_copy_list) ? 
"copy" : "comp"; } -bool list_factory::is_copy() const { - return is_copy_list; -} - list_manager::list_manager(const ccl_sched_base* sched, const ccl_stream* stream) : sched(sched), device(stream->get_ze_device()), @@ -496,14 +484,6 @@ void list_manager::reset_execution_state() { } } -bool list_manager::can_use_copy_queue() const { - return use_copy_queue; -} - -bool list_manager::can_use_main_queue() const { - return main_queue_available; -} - bool list_manager::is_executed() const { return executed; } diff --git a/src/sched/ze/ze_list_manager.hpp b/src/sched/ze/ze_list_manager.hpp index d8d48a8e5..a7e8baa6b 100644 --- a/src/sched/ze/ze_list_manager.hpp +++ b/src/sched/ze/ze_list_manager.hpp @@ -71,7 +71,7 @@ class queue_info { ze_command_queue_handle_t queue{}; ze_command_queue_desc_t desc{}; bool is_copy_queue{}; - queue_group_type type; + queue_group_type type{}; }; using queue_info_t = typename std::shared_ptr; @@ -85,7 +85,6 @@ class queue_factory { queue_info_t get(uint32_t index); void clear(); - bool is_copy() const; uint32_t get_ordinal() const; static bool can_use_queue_group(ze_device_handle_t device, @@ -118,8 +117,6 @@ class list_factory { list_info_t get(const queue_info_t& queue); void destroy(list_info_t& list); - bool is_copy() const; - private: const ze_device_handle_t device; const ze_context_handle_t context; @@ -151,8 +148,6 @@ class list_manager { void clear(); void reset_execution_state(); - bool can_use_copy_queue() const; - bool can_use_main_queue() const; bool is_executed() const; private: diff --git a/src/stub_kvs_impl.cpp b/src/stub_kvs_impl.cpp index 2afea0c9d..d2183dbab 100644 --- a/src/stub_kvs_impl.cpp +++ b/src/stub_kvs_impl.cpp @@ -29,9 +29,8 @@ static int get_unique_id() { } static kvs::address_type convert_id_to_addr(int id) { - kvs::address_type addr; + kvs::address_type addr{ 0 }; - memset(addr.data(), 0, sizeof(addr)); memcpy(addr.data(), &id, sizeof(id)); return addr; diff --git a/src/topology/topo_manager.cpp b/src/topology/topo_manager.cpp index 76a1d4af4..0f2bdb0b2 100644 --- a/src/topology/topo_manager.cpp +++ b/src/topology/topo_manager.cpp @@ -119,9 +119,13 @@ bool topo_manager::has_oversubscription() const { return is_oversubscription_detected; } +size_t topo_manager::get_unique_device_uuids_count() const { + return unique_device_uuids_count; +} + bool topo_manager::oversubscription_detected(const ze_rank_info_vec_t& ze_rank_infos, const host_info_vec_t& host_infos) { - size_t unique_device_uuids_count = topo_manager::invalid_device_uuids_count; + size_t unique_dev_uuids_count = topo_manager::invalid_device_uuids_count; for (const auto& host_info : host_infos) { std::vector unique_device_uuids; for (auto rank : host_info.ranks) { @@ -130,11 +134,14 @@ bool topo_manager::oversubscription_detected(const ze_rank_info_vec_t& ze_rank_i unique_device_uuids.push_back(rank_info.device_uuid); } } - unique_device_uuids_count += unique_device_uuids.size(); + unique_dev_uuids_count += unique_device_uuids.size(); } - CCL_THROW_IF_NOT(unique_device_uuids_count != topo_manager::invalid_device_uuids_count, - "invalid unique_device_uuids_count"); + CCL_THROW_IF_NOT(unique_dev_uuids_count != topo_manager::invalid_device_uuids_count, + "invalid unique_dev_uuids_count"); + + unique_device_uuids_count = unique_dev_uuids_count; + if (unique_device_uuids_count < rank_info_vec.size()) { LOG_DEBUG("unique_device_uuids_count: ", unique_device_uuids_count, @@ -824,8 +831,8 @@ fabric_ports_t topo_manager::get_fabric_ports() { std::vector ports(port_count); 
diff --git a/src/topology/topo_manager.cpp b/src/topology/topo_manager.cpp
index 76a1d4af4..0f2bdb0b2 100644
--- a/src/topology/topo_manager.cpp
+++ b/src/topology/topo_manager.cpp
@@ -119,9 +119,13 @@ bool topo_manager::has_oversubscription() const {
     return is_oversubscription_detected;
 }

+size_t topo_manager::get_unique_device_uuids_count() const {
+    return unique_device_uuids_count;
+}
+
 bool topo_manager::oversubscription_detected(const ze_rank_info_vec_t& ze_rank_infos,
                                              const host_info_vec_t& host_infos) {
-    size_t unique_device_uuids_count = topo_manager::invalid_device_uuids_count;
+    size_t unique_dev_uuids_count = topo_manager::invalid_device_uuids_count;
     for (const auto& host_info : host_infos) {
         std::vector<ze_device_uuid_t> unique_device_uuids;
         for (auto rank : host_info.ranks) {
@@ -130,11 +134,14 @@ bool topo_manager::oversubscription_detected(const ze_rank_info_vec_t& ze_rank_infos,
                 unique_device_uuids.push_back(rank_info.device_uuid);
             }
         }
-        unique_device_uuids_count += unique_device_uuids.size();
+        unique_dev_uuids_count += unique_device_uuids.size();
     }

-    CCL_THROW_IF_NOT(unique_device_uuids_count != topo_manager::invalid_device_uuids_count,
-                     "invalid unique_device_uuids_count");
+    CCL_THROW_IF_NOT(unique_dev_uuids_count != topo_manager::invalid_device_uuids_count,
+                     "invalid unique_dev_uuids_count");
+
+    unique_device_uuids_count = unique_dev_uuids_count;
+
     if (unique_device_uuids_count < rank_info_vec.size()) {
         LOG_DEBUG("unique_device_uuids_count: ",
                   unique_device_uuids_count,
@@ -824,8 +831,8 @@ fabric_ports_t topo_manager::get_fabric_ports() {
     std::vector<zes_fabric_port_handle_t> ports(port_count);
     ZE_CALL(zesDeviceEnumFabricPorts, ((zes_device_handle_t)ze_device, &port_count, ports.data()));

-    bool use_all_ports =
-        (ccl::ze::get_device_family(ze_device) == ccl::device_family::family2) ? true : false;
+    bool use_all_ports = (ccl::ze::get_device_family(ze_device) == ccl::device_family::family2 ||
+                          ccl::ze::get_device_family(ze_device) == ccl::device_family::family3);
     char* use_all_ports_env = getenv("CCL_TOPO_ALL_PORTS");
     if (use_all_ports_env) {
         use_all_ports = atoi(use_all_ports_env);
@@ -882,7 +889,8 @@ fabric_ports_t topo_manager::get_fabric_ports() {
     utils::allgather(comm, &my_port_count, all_port_counts.data(), sizeof(my_port_count));

-    size_t total_port_count = std::accumulate(all_port_counts.begin(), all_port_counts.end(), 0);
+    size_t total_port_count =
+        std::accumulate(all_port_counts.begin(), all_port_counts.end(), size_t(0));

     if (total_port_count == 0) {
         LOG_DEBUG("no ports detected");
@@ -1098,12 +1106,23 @@ void topo_manager::detect_tune_port_count(const std::vector& de
         }
     }

-        if (first_dev_port_count == topo_manager::tune_port_count) {
-            global_data::env().reduce_scatter_topo_read = 0;
+        if (first_dev_port_count == topo_manager::type1_tune_port_count ||
+            first_dev_port_count == topo_manager::type2_tune_port_count) {
             global_data::env().allgatherv_topo_read = 0;
-            global_data::env().alltoallv_topo_read = 0;
-            global_data::env().alltoallv_monolithic_read_kernel = 0;
-            LOG_INFO("12 ports system is detected, write mode is set");
+            if (first_dev_port_count == topo_manager::type1_tune_port_count) {
+                global_data::env().alltoallv_topo_read = 0;
+                global_data::env().alltoallv_monolithic_kernel = 0;
+            }
+            global_data::env().reduce_scatter_topo_read = 0;
+            global_data::env().ze_pt2pt_read = 0;
+
+            if (global_data::env().type2_mode != type2_tune_mode::off &&
+                first_dev_port_count == topo_manager::type2_tune_port_count) {
+                global_data::env().type2_mode = type2_tune_mode::detected;
+                LOG_DEBUG(
+                    "system with ", first_dev_port_count, " ports is detected, write mode is set");
+            }
+            LOG_INFO(first_dev_port_count, " ports system is detected, write mode is set");
         }
     }
     else {
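The size_t(0) change in get_fabric_ports is a classic std::accumulate detail: the accumulator takes the type of the initial value, so a bare 0 sums in int regardless of the element type. A self-contained illustration, not oneCCL code; double elements are used because the int truncation there is well defined, whereas an int overflow on large port totals would be undefined behavior:

    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
        std::vector<double> v = { 0.5, 0.5, 0.5 };

        // with a bare 0 the accumulator is int: every partial sum is truncated,
        // so the result is 0; with 0.0 (or size_t(0) for counters) it is correct
        int truncated = std::accumulate(v.begin(), v.end(), 0);
        double correct = std::accumulate(v.begin(), v.end(), 0.0);

        std::printf("int init: %d double init: %g\n", truncated, correct);
        return 0;
    }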
diff --git a/src/topology/topo_manager.hpp b/src/topology/topo_manager.hpp
index c7a22b6dd..9344f9981 100644
--- a/src/topology/topo_manager.hpp
+++ b/src/topology/topo_manager.hpp
@@ -30,6 +30,13 @@
 #include "common/utils/exchange_utils.hpp"

 namespace ccl {
+enum class type2_tune_mode : int { undetected, detected, on, off };
+static std::map<type2_tune_mode, std::string> type2_tune_mode_names = {
+    std::make_pair(type2_tune_mode::undetected, "undetected"),
+    std::make_pair(type2_tune_mode::detected, "detected"),
+    std::make_pair(type2_tune_mode::on, "on"),
+    std::make_pair(type2_tune_mode::off, "off")
+};

 enum class topo_color_mode : int { fixed, ze, env };
 static std::map<topo_color_mode, std::string> topo_color_names = {
@@ -41,10 +48,10 @@ static std::map<topo_color_mode, std::string> topo_color_names = {
 static constexpr int topo_uuid_len = 35;

 struct topo_rank_info {
-    int rank;
-    int host_idx;
-    int local_proc_idx;
-    char uuid[topo_uuid_len];
+    int rank{ ccl::utils::invalid_rank };
+    int host_idx{ ccl::utils::invalid_host_idx };
+    int local_proc_idx{ ccl::utils::invalid_rank };
+    char uuid[topo_uuid_len]{};

     topo_rank_info();
 };
@@ -113,10 +120,13 @@ class topo_manager {
     static constexpr int max_ranks_per_plane = 8;
     static constexpr int max_domain_count = 2;
     static constexpr size_t invalid_device_uuids_count = 0;
-    // to determine the florence system, 12 number of ports
-    // is used: LRZ has 3 pairs and there're 2 links between pairs,
+    // to determine a type1 system, a port count of 12
+    // is used: type1 has 3 pairs and there are 2 links between pairs,
     // so 3 * 2 * 2 = 12 ports
-    static constexpr uint32_t tune_port_count = 12;
+    static constexpr uint32_t type1_tune_port_count = 12;
+    // to determine a type2 system, a port count of 6
+    // is used: type2 is a one-tile host: 3 * 2 * 1 = 6 ports
+    static constexpr uint32_t type2_tune_port_count = 6;

     static constexpr int card_domain_idx = 0;
     static constexpr int plane_domain_idx = 1;
@@ -154,6 +164,7 @@ class topo_manager {

     bool has_oversubscription() const;
     bool is_oversubscription_detected = false;
+    size_t get_unique_device_uuids_count() const;
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE

     rank_info_vec_t get_filtered_rank_info_vec(int filter_host_idx) const;
@@ -232,6 +243,7 @@ class topo_manager {
     bool is_p2p_access_enabled = false;

     port_health_status port_status = port_health_status::unknown;
+    size_t unique_device_uuids_count = topo_manager::invalid_device_uuids_count;
 #endif // CCL_ENABLE_SYCL && CCL_ENABLE_ZE
 };
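The new type2_tune_mode enum follows the same enum-to-name map pattern the header already uses for topo_color_mode, which keeps log output readable without scattering switch statements. A compilable sketch of the pattern in isolation; the names mirror the header, and main is only a usage demo:

    #include <cstdio>
    #include <map>
    #include <string>

    enum class type2_tune_mode : int { undetected, detected, on, off };

    static const std::map<type2_tune_mode, std::string> type2_tune_mode_names = {
        { type2_tune_mode::undetected, "undetected" },
        { type2_tune_mode::detected, "detected" },
        { type2_tune_mode::on, "on" },
        { type2_tune_mode::off, "off" }
    };

    int main() {
        type2_tune_mode mode = type2_tune_mode::detected;
        // .at() throws if an enumerator is ever missing from the map, which
        // surfaces a forgotten entry early instead of logging garbage
        std::printf("type2 tune mode: %s\n", type2_tune_mode_names.at(mode).c_str());
        return 0;
    }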
diff --git a/src/unordered_coll/unordered_coll.cpp b/src/unordered_coll/unordered_coll.cpp
index dadec45bc..ffe879e7f 100644
--- a/src/unordered_coll/unordered_coll.cpp
+++ b/src/unordered_coll/unordered_coll.cpp
@@ -161,9 +161,11 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     coll_param.comm = coordination_comm.get();

     ccl_sched* null_sched = nullptr;
-    std::unique_ptr<ccl_sched> service_sched(new ccl_sched(
-        { ccl_sched_unordered_coll, coordination_comm->get_sched_id(true), coll_param },
-        null_sched));
+    std::unique_ptr<ccl_sched> service_sched(
+        new ccl_sched({ ccl_sched_unordered_coll,
+                        coordination_comm->get_sched_id(true, coll_param.is_pt2pt),
+                        coll_param },
+                      null_sched));

     if (ccl::global_data::env().priority_mode == ccl_priority_lifo) {
         service_sched->coll_attr.priority = ccl_sched_base::get_lifo_priority();
@@ -197,7 +199,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
                 ctx->match_id_size);
     }

-    ccl_coll_entry_param match_id_size_param{};
+    ccl_coll_param match_id_size_param{};
     match_id_size_param.ctype = ccl_coll_bcast;
     match_id_size_param.recv_buf = ccl_buffer(&ctx->match_id_size, sizeof(size_t));
     match_id_size_param.count = sizeof(size_t);
@@ -209,7 +211,7 @@ void ccl_unordered_coll_manager::start_coordination(const std::string& match_id)
     service_sched->add_barrier();

     /* 2. broadcast match_id_value */
-    ccl_coll_entry_param match_id_val_param{};
+    ccl_coll_param match_id_val_param{};
     match_id_val_param.ctype = ccl_coll_bcast;
     match_id_val_param.recv_buf = ccl_buffer();
     match_id_val_param.count = 0;
diff --git a/tests/functional/test_impl.hpp b/tests/functional/test_impl.hpp
index b6bfdf721..be4291f7a 100644
--- a/tests/functional/test_impl.hpp
+++ b/tests/functional/test_impl.hpp
@@ -186,7 +186,8 @@ int base_test<T>::check_error(test_operation<T>& op, T expected, size_t buf_idx,
         }
     }

-    if (fabs(max_error) < fabs((double)expected - (double)op.recv_bufs[buf_idx][elem_idx])) {
+    if (std::fabs(max_error) <
+        std::fabs((double)expected - (double)op.recv_bufs[buf_idx][elem_idx])) {
         printf("[%d] got op.recvBuf[%zu][%zu] = %0.7f, but expected = %0.7f, "
                "max_error = %0.10f, precision = %0.7f\n",
                op.comm_rank,
diff --git a/third-party-programs.txt b/third-party-programs.txt
index 95e591d05..357a2cf73 100644
--- a/third-party-programs.txt
+++ b/third-party-programs.txt
@@ -1,5 +1,5 @@
 Intel(R) oneAPI Collective Communications Library (oneCCL)
-2021.10.0 Third Party Programs File
+2021.11.0 Third Party Programs File

 This file is the "third-party-programs.txt" file specified in the associated Intel
 end user license agreement for the Intel software you are licensing.
@@ -652,7 +652,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 The following third party programs have their own third party programs. These
 additional third party program files are as follows:
     1. Intel(R) Intel(R) MPI Library
-       /mpi/licensing/third-party-programs.txt
+       /mpi/latest/share/doc/mpi/third-party-programs.txt

--------------------------------------------------------------------------------