From e9d4f1ffb5efd5783933efea4cff0eceeac05ce5 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Thu, 11 Jul 2024 16:41:43 +0000
Subject: [PATCH 1/8] Add overlap communication to read_distributed

---
 .../cuda_hip/distributed/matrix_kernels.cpp   | 28 ++++++
 core/device_hooks/common_kernels.inc.cpp      |  3 +
 core/distributed/matrix.cpp                   | 97 +++++++++++++++++--
 core/distributed/matrix_kernels.hpp           | 31 ++++++
 dpcpp/distributed/matrix_kernels.dp.cpp       | 28 ++++++
 include/ginkgo/core/distributed/matrix.hpp    | 24 ++++-
 omp/distributed/matrix_kernels.cpp            | 28 ++++++
 reference/distributed/matrix_kernels.cpp      | 62 ++++++++++++
 8 files changed, 289 insertions(+), 12 deletions(-)
diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index 88988febbb0..70824bf03d9 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -49,6 +49,34 @@ struct input_type {
 };
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_overlap_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part,
+    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_overlap_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& offsets,
+    array<GlobalIndexType>& overlap_row_idxs,
+    array<GlobalIndexType>& overlap_col_idxs,
+    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 1ba925e94e3..304bed16683 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -274,6 +274,9 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 namespace distributed_matrix {
 
 
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 63f359cc40a..e4bae04d180 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -4,6 +4,9 @@
 
 #include "ginkgo/core/distributed/matrix.hpp"
 
+#include <numeric>
+#include <vector>
+
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
@@ -20,6 +23,10 @@ namespace matrix {
 namespace {
 
 
+GKO_REGISTER_OPERATION(count_overlap_entries,
+                       distributed_matrix::count_overlap_entries);
+GKO_REGISTER_OPERATION(fill_overlap_send_buffers,
+                       distributed_matrix::fill_overlap_send_buffers);
 GKO_REGISTER_OPERATION(separate_local_nonlocal,
                        distributed_matrix::separate_local_nonlocal);
 
@@ -243,7 +250,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        col_partition)
+        col_partition,
+    assembly assembly_type)
 {
     const auto comm = this->get_communicator();
     GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size());
@@ -252,6 +260,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     GKO_ASSERT_EQ(comm.size(), col_partition->get_num_parts());
     auto exec = this->get_executor();
     auto local_part = comm.rank();
+    auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
 
     // set up LinOp sizes
     auto num_parts = static_cast<size_type>(row_partition->get_num_parts());
@@ -260,6 +269,76 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     dim<2> global_dim{global_num_rows, global_num_cols};
     this->set_size(global_dim);
 
+    if (assembly_type == assembly::communicate) {
+        array<comm_index_type> overlap_count{exec, num_parts};
+        overlap_count.fill(0);
+        auto tmp_part = make_temporary_clone(exec, row_partition);
+        exec->run(matrix::make_count_overlap_entries(
+            data, tmp_part.get(), local_part, overlap_count));
+
+        overlap_count.set_executor(exec->get_master());
+        std::vector<comm_index_type> overlap_send_sizes(
+            overlap_count.get_data(), overlap_count.get_data() + num_parts);
+        std::vector<comm_index_type> overlap_send_offsets(num_parts + 1);
+        std::vector<comm_index_type> overlap_recv_sizes(num_parts);
+        std::vector<comm_index_type> overlap_recv_offsets(num_parts + 1);
+
+        std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(),
+                         overlap_send_offsets.begin() + 1);
+        comm.all_to_all(exec, overlap_send_sizes.data(), 1,
+                        overlap_recv_sizes.data(), 1);
+        std::partial_sum(overlap_recv_sizes.begin(), overlap_recv_sizes.end(),
+                         overlap_recv_offsets.begin() + 1);
+        overlap_send_offsets[0] = 0;
+        overlap_recv_offsets[0] = 0;
+
+        size_type n_send = overlap_send_offsets.back();
+        size_type n_recv = overlap_recv_offsets.back();
+        array<global_index_type> overlap_send_row_idxs{exec, n_send};
+        array<global_index_type> overlap_send_col_idxs{exec, n_send};
+        array<value_type> overlap_send_values{exec, n_send};
+        array<global_index_type> overlap_recv_row_idxs{exec, n_recv};
+        array<global_index_type> overlap_recv_col_idxs{exec, n_recv};
+        array<value_type> overlap_recv_values{exec, n_recv};
+        auto offset_array =
+            make_const_array_view(exec->get_master(), num_parts + 1,
+                                  overlap_send_offsets.data())
+                .copy_to_array();
+        offset_array.set_executor(exec);
+        exec->run(matrix::make_fill_overlap_send_buffers(
+            data, tmp_part.get(), local_part, offset_array,
+            overlap_send_row_idxs, overlap_send_col_idxs, overlap_send_values));
+
+        if (use_host_buffer) {
+            overlap_send_row_idxs.set_executor(exec->get_master());
+            overlap_send_col_idxs.set_executor(exec->get_master());
+            overlap_send_values.set_executor(exec->get_master());
+            overlap_recv_row_idxs.set_executor(exec->get_master());
+            overlap_recv_col_idxs.set_executor(exec->get_master());
+            overlap_recv_values.set_executor(exec->get_master());
+        }
+        comm.all_to_all_v(
+            use_host_buffer ? exec : exec->get_master(),
+            overlap_send_row_idxs.get_const_data(), overlap_send_sizes.data(),
+            overlap_send_offsets.data(), overlap_recv_row_idxs.get_data(),
+            overlap_recv_sizes.data(), overlap_recv_offsets.data());
+        comm.all_to_all_v(
+            use_host_buffer ? exec : exec->get_master(),
+            overlap_send_col_idxs.get_const_data(), overlap_send_sizes.data(),
+            overlap_send_offsets.data(), overlap_recv_col_idxs.get_data(),
+            overlap_recv_sizes.data(), overlap_recv_offsets.data());
+        comm.all_to_all_v(
+            use_host_buffer ? exec : exec->get_master(),
+            overlap_send_values.get_const_data(), overlap_send_sizes.data(),
+            overlap_send_offsets.data(), overlap_recv_values.get_data(),
+            overlap_recv_sizes.data(), overlap_recv_offsets.data());
+        if (use_host_buffer) {
+            overlap_recv_row_idxs.set_executor(exec);
+            overlap_recv_col_idxs.set_executor(exec);
+            overlap_recv_values.set_executor(exec);
+        }
+    }
+
     // temporary storage for the output
     array<local_index_type> local_row_idxs{exec};
     array<local_index_type> local_col_idxs{exec};
@@ -335,7 +414,6 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
             imap.get_executor(), imap.get_non_local_size(),
             imap.get_remote_local_idxs().get_const_flat_data())
             .copy_to_array();
-    auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
     if (use_host_buffer) {
         recv_gather_idxs.set_executor(exec->get_master());
         gather_idxs_.clear();
@@ -358,12 +436,13 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        col_partition)
+        col_partition,
+    assembly assembly_type)
 {
     return this->read_distributed(
         device_matrix_data<value_type, global_index_type>::create_from_host(
             this->get_executor(), data),
-        row_partition, col_partition);
+        row_partition, col_partition, assembly_type);
 }
 
 
@@ -371,12 +450,13 @@ template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const matrix_data<ValueType, global_index_type>& data,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        partition)
+        partition,
+    assembly assembly_type)
 {
     return this->read_distributed(
         device_matrix_data<value_type, global_index_type>::create_from_host(
             this->get_executor(), data),
-        partition, partition);
+        partition, partition, assembly_type);
 }
 
 
@@ -384,9 +464,10 @@ template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const device_matrix_data<ValueType, GlobalIndexType>& data,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
-        partition)
+        partition,
+    assembly assembly_type)
 {
-    return this->read_distributed(data, partition, partition);
+    return this->read_distributed(data, partition, partition, assembly_type);
 }
 
 
diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp
index f24e8c9945e..fa7e891eb4e 100644
--- a/core/distributed/matrix_kernels.hpp
+++ b/core/distributed/matrix_kernels.hpp
@@ -19,6 +19,29 @@ namespace gko {
 namespace kernels {
 
 
+#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \
+                                          GlobalIndexType)           \
+    void count_overlap_entries(                                      \
+        std::shared_ptr<const DefaultExecutor> exec,                 \
+        const device_matrix_data<ValueType, GlobalIndexType>& input, \
+        const experimental::distributed::Partition<                  \
+            LocalIndexType, GlobalIndexType>* row_partition,         \
+        comm_index_type local_part, array<comm_index_type>& overlap_count)
+
+
+#define GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \
+                                              GlobalIndexType)           \
+    void fill_overlap_send_buffers(                                      \
+        std::shared_ptr<const DefaultExecutor> exec,                     \
+        const device_matrix_data<ValueType, GlobalIndexType>& input,     \
+        const experimental::distributed::Partition<                      \
+            LocalIndexType, GlobalIndexType>* row_partition,             \
+        comm_index_type local_part, array<comm_index_type>& offsets,     \
+        array<GlobalIndexType>& overlap_row_idxs,                        \
+        array<GlobalIndexType>& overlap_col_idxs,                        \
+        array<ValueType>& overlap_values)
+
+
 #define GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,         \
                                             GlobalIndexType)                   \
     void separate_local_nonlocal(                                              \
@@ -37,6 +60,14 @@ namespace kernels {
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
     using comm_index_type = experimental::distributed::comm_index_type; \
+    template <typename ValueType, typename LocalIndexType,              \
+              typename GlobalIndexType>                                 \
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType,        \
+                                      GlobalIndexType);                 \
+    template <typename ValueType, typename LocalIndexType,              \
+              typename GlobalIndexType>                                 \
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType,    \
+                                          GlobalIndexType);             \
     template <typename ValueType, typename LocalIndexType,              \
               typename GlobalIndexType>                                 \
     GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,      \
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index 47adaaeca59..9225e58ad14 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -13,6 +13,34 @@ namespace dpcpp {
 namespace distributed_matrix {
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_overlap_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part,
+    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_overlap_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& offsets,
+    array<GlobalIndexType>& overlap_row_idxs,
+    array<GlobalIndexType>& overlap_col_idxs,
+    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index de719bb9315..51e6e498348 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -133,6 +133,18 @@ namespace experimental {
 namespace distributed {
 
 
+/**
+ * assembly defines how the read_distributed function of the distributed
+ * matrix treats non-local indices in the (device_)matrix_data:
+ * - communicate communicates the overlap between ranks and adds up all local
+ *   contributions. Indices smaller than 0 or larger than the global size
+ *   of the matrix are ignored.
+ * - local_only does not communicate any overlap but ignores all non-local
+ *   indices.
+ */
+enum class assembly { communicate, local_only };
+
+
 template <typename LocalIndexType, typename GlobalIndexType>
 class Partition;
 template <typename ValueType>
@@ -297,7 +309,8 @@ class Matrix
     void read_distributed(
         const device_matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            partition);
+            partition,
+        assembly assembly_type = assembly::local_only);
 
     /**
      * Reads a square matrix from the matrix_data structure and a global
@@ -311,7 +324,8 @@ class Matrix
     void read_distributed(
         const matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            partition);
+            partition,
+        assembly assembly_type = assembly::local_only);
 
     /**
      * Reads a matrix from the device_matrix_data structure, a global row
@@ -335,7 +349,8 @@ class Matrix
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            col_partition);
+            col_partition,
+        assembly assembly_type = assembly::local_only);
 
     /**
      * Reads a matrix from the matrix_data structure, a global row partition,
@@ -351,7 +366,8 @@ class Matrix
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
-            col_partition);
+            col_partition,
+        assembly assembly_type = assembly::local_only);
 
     /**
      * Get read access to the stored local matrix.
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index 2f36ec4a778..a3e8cb60868 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -20,6 +20,34 @@ namespace omp {
 namespace distributed_matrix {
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_overlap_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part,
+    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_overlap_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& offsets,
+    array<GlobalIndexType>& overlap_row_idxs,
+    array<GlobalIndexType>& overlap_col_idxs,
+    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 95176b34656..d8b0f9e1d4f 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -7,6 +7,7 @@
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
+#include "ginkgo/core/distributed/partition.hpp"
 #include "reference/distributed/partition_helpers.hpp"
 
 
@@ -16,6 +17,67 @@ namespace reference {
 namespace distributed_matrix {
 
 
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void count_overlap_entries(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& overlap_count)
+{
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto row_part_ids = row_partition->get_part_ids();
+
+    size_type row_range_id = 0;
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto global_row = input_row_idxs[i];
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        auto row_part_id = row_part_ids[row_range_id];
+        if (row_part_id != local_part) {
+            overlap_count.get_data()[row_part_id]++;
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+
+
+template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
+void fill_overlap_send_buffers(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
+        row_partition,
+    comm_index_type local_part, array<comm_index_type>& offsets,
+    array<GlobalIndexType>& overlap_row_idxs,
+    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+{
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_vals = input.get_const_values();
+    auto row_part_ids = row_partition->get_part_ids();
+
+    size_type row_range_id = 0;
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto global_row = input_row_idxs[i];
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        auto row_part_id = row_part_ids[row_range_id];
+        if (row_part_id != local_part) {
+            auto idx = offsets.get_data()[row_part_id]++;
+            overlap_row_idxs.get_data()[idx] = global_row;
+            overlap_col_idxs.get_data()[idx] = input_col_idxs[i];
+            overlap_values.get_data()[idx] = input_vals[i];
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+
+
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
 void separate_local_nonlocal(
     std::shared_ptr<const DefaultExecutor> exec,

From 20f01886f1bab04fc13d970972b172e326a7b257 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Tue, 16 Jul 2024 12:14:20 +0000
Subject: [PATCH 2/8] test communicating read_distributed

---
 core/distributed/matrix.cpp | 38 ++++++++++++++++++++------
 test/mpi/matrix.cpp         | 53 ++++++++++++++++++++++++++++++++++---
 2 files changed, 80 insertions(+), 11 deletions(-)

diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index e4bae04d180..a64a07619b3 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -14,6 +14,7 @@
 #include <ginkgo/core/matrix/diagonal.hpp>
 
 #include "core/distributed/matrix_kernels.hpp"
+#include "ginkgo/core/base/mtx_io.hpp"
 
 
 namespace gko {
@@ -263,14 +264,14 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
 
     // set up LinOp sizes
-    auto num_parts = static_cast<size_type>(row_partition->get_num_parts());
     auto global_num_rows = row_partition->get_size();
     auto global_num_cols = col_partition->get_size();
     dim<2> global_dim{global_num_rows, global_num_cols};
     this->set_size(global_dim);
 
+    device_matrix_data<value_type, global_index_type> all_data{exec};
     if (assembly_type == assembly::communicate) {
-        array<comm_index_type> overlap_count{exec, num_parts};
+        array<comm_index_type> overlap_count{exec, comm.size()};
         overlap_count.fill(0);
         auto tmp_part = make_temporary_clone(exec, row_partition);
         exec->run(matrix::make_count_overlap_entries(
@@ -278,10 +279,10 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
 
         overlap_count.set_executor(exec->get_master());
         std::vector<comm_index_type> overlap_send_sizes(
-            overlap_count.get_data(), overlap_count.get_data() + num_parts);
-        std::vector<comm_index_type> overlap_send_offsets(num_parts + 1);
-        std::vector<comm_index_type> overlap_recv_sizes(num_parts);
-        std::vector<comm_index_type> overlap_recv_offsets(num_parts + 1);
+            overlap_count.get_data(), overlap_count.get_data() + comm.size());
+        std::vector<comm_index_type> overlap_send_offsets(comm.size() + 1);
+        std::vector<comm_index_type> overlap_recv_sizes(comm.size());
+        std::vector<comm_index_type> overlap_recv_offsets(comm.size() + 1);
 
         std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(),
                          overlap_send_offsets.begin() + 1);
@@ -301,7 +302,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         array<global_index_type> overlap_recv_col_idxs{exec, n_recv};
         array<value_type> overlap_recv_values{exec, n_recv};
         auto offset_array =
-            make_const_array_view(exec->get_master(), num_parts + 1,
+            make_const_array_view(exec->get_master(), comm.size() + 1,
                                   overlap_send_offsets.data())
                 .copy_to_array();
         offset_array.set_executor(exec);
@@ -337,6 +338,26 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
             overlap_recv_col_idxs.set_executor(exec);
             overlap_recv_values.set_executor(exec);
         }
+
+        size_type n_nnz = data.get_num_stored_elements();
+        array<global_index_type> all_row_idxs{exec, n_nnz + n_recv};
+        array<global_index_type> all_col_idxs{exec, n_nnz + n_recv};
+        array<value_type> all_values{exec, n_nnz + n_recv};
+        exec->copy_from(exec, n_nnz, data.get_const_row_idxs(),
+                        all_row_idxs.get_data());
+        exec->copy_from(exec, n_recv, overlap_recv_row_idxs.get_data(),
+                        all_row_idxs.get_data() + n_nnz);
+        exec->copy_from(exec, n_nnz, data.get_const_col_idxs(),
+                        all_col_idxs.get_data());
+        exec->copy_from(exec, n_recv, overlap_recv_col_idxs.get_data(),
+                        all_col_idxs.get_data() + n_nnz);
+        exec->copy_from(exec, n_nnz, data.get_const_values(),
+                        all_values.get_data());
+        exec->copy_from(exec, n_recv, overlap_recv_values.get_data(),
+                        all_values.get_data() + n_nnz);
+        all_data = device_matrix_data<value_type, global_index_type>{
+            exec, global_dim, all_row_idxs, all_col_idxs, all_values};
+        all_data.sum_duplicates();
     }
 
     // temporary storage for the output
@@ -352,7 +373,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     // as well as the rows of the non-local block. The columns of the non-local
     // block are still in global indices.
     exec->run(matrix::make_separate_local_nonlocal(
-        data, make_temporary_clone(exec, row_partition).get(),
+        assembly_type == assembly::communicate ? all_data : data,
+        make_temporary_clone(exec, row_partition).get(),
         make_temporary_clone(exec, col_partition).get(), local_part,
         local_row_idxs, local_col_idxs, local_values, non_local_row_idxs,
         global_non_local_col_idxs, non_local_values));
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index f4b8af2fb19..1b25ad7eb6d 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -60,9 +60,16 @@ class MatrixCreation : public CommonMpiTestFixture {
                      {3, 4, 7},
                      {4, 0, 9},
                      {4, 4, 10}}},
-          dist_input{{{size, {{0, 1, 1}, {0, 3, 2}, {1, 1, 3}, {1, 2, 4}}},
-                      {size, {{2, 1, 5}, {2, 2, 6}, {3, 3, 8}, {3, 4, 7}}},
-                      {size, {{4, 0, 9}, {4, 4, 10}}}}},
+          dist_input{
+              {{size,
+                {{0, 1, 1},
+                 {0, 3, 2},
+                 {1, 1, 3},
+                 {1, 2, 4},
+                 {2, 0, 1},
+                 {2, 3, 1}}},
+               {size, {{0, 0, 1}, {2, 1, 5}, {2, 2, 6}, {3, 3, 8}, {3, 4, 7}}},
+               {size, {{2, 2, 1}, {3, 3, -1}, {4, 0, 9}, {4, 4, 10}}}}},
           engine(42)
     {
         row_part = Partition::build_from_contiguous(
@@ -134,6 +141,26 @@ TYPED_TEST(MatrixCreation, ReadsDistributedLocalData)
 }
 
 
+TYPED_TEST(MatrixCreation, ReadsDistributedLocalDataWithCommunicate)
+{
+    using value_type = typename TestFixture::value_type;
+    using csr = typename TestFixture::local_matrix_type;
+    I<I<value_type>> res_local[] = {{{1, 1}, {0, 3}}, {{7, 1}, {0, 7}}, {{10}}};
+    I<I<value_type>> res_non_local[] = {
+        {{0, 2}, {4, 0}}, {{1, 5, 0}, {0, 0, 7}}, {{9}}};
+    auto rank = this->dist_mat->get_communicator().rank();
+
+    this->dist_mat->read_distributed(
+        this->dist_input[rank], this->row_part,
+        gko::experimental::distributed::assembly::communicate);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
+                        res_local[rank], 0);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
+                        res_non_local[rank], 0);
+}
+
+
 TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition)
 {
     using value_type = typename TestFixture::value_type;
@@ -153,6 +180,26 @@ TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition)
 }
 
 
+TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartitionAndCommunicate)
+{
+    using value_type = typename TestFixture::value_type;
+    using csr = typename TestFixture::local_matrix_type;
+    I<I<value_type>> res_local[] = {{{2, 0}, {0, 0}}, {{1, 5}, {0, 0}}, {{0}}};
+    I<I<value_type>> res_non_local[] = {
+        {{1, 1, 0}, {0, 3, 4}}, {{1, 0, 7}, {7, 7, 0}}, {{10, 9}}};
+    auto rank = this->dist_mat->get_communicator().rank();
+
+    this->dist_mat->read_distributed(
+        this->dist_input[rank], this->row_part, this->col_part,
+        gko::experimental::distributed::assembly::communicate);
+
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
+                        res_local[rank], 0);
+    GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_non_local_matrix()),
+                        res_non_local[rank], 0);
+}
+
+
 TYPED_TEST(MatrixCreation, BuildOnlyLocal)
 {
     using value_type = typename TestFixture::value_type;

From e193811ea4c3b3552243f4753a3d410200f5454f Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Wed, 17 Jul 2024 12:28:11 +0000
Subject: [PATCH 3/8] Add reference kernel tests

---
 reference/test/distributed/matrix_kernels.cpp | 81 +++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp
index a34844cbde9..00d3fcd8895 100644
--- a/reference/test/distributed/matrix_kernels.cpp
+++ b/reference/test/distributed/matrix_kernels.cpp
@@ -17,6 +17,8 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 #include "core/test/utils.hpp"
+#include "ginkgo/core/base/array.hpp"
+#include "ginkgo/core/base/types.hpp"
 
 
 namespace {
@@ -186,6 +188,85 @@ TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes,
                  TupleTypenameNameGenerator);
 
 
+TYPED_TEST(Matrix, CountOverlapEntries)
+{
+    using lit = typename TestFixture::local_index_type;
+    using git = typename TestFixture::global_index_type;
+    using vt = typename TestFixture::value_type;
+    using ca = gko::array<comm_index_type>;
+    this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
+    std::vector<ca> overlap_count_ref{
+        ca{this->ref, I<comm_index_type>{0, 5, 3}},
+        ca{this->ref, I<comm_index_type>{4, 0, 3}},
+        ca{this->ref, I<comm_index_type>{4, 5, 0}}};
+    comm_index_type num_parts = 3;
+    auto partition =
+        gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
+            this->ref, this->mapping, num_parts);
+    auto input = this->create_input_full_rank();
+
+    gko::array<comm_index_type> overlap_count{
+        this->ref, static_cast<gko::size_type>(num_parts)};
+    for (gko::size_type i = 0; i < num_parts; i++) {
+        overlap_count.fill(0);
+        gko::kernels::reference::distributed_matrix::count_overlap_entries(
+            this->ref, input, partition.get(), i, overlap_count);
+        GKO_ASSERT_ARRAY_EQ(overlap_count, overlap_count_ref[i]);
+    }
+}
+
+
+TYPED_TEST(Matrix, FillOverlapSendBuffers)
+{
+    using lit = typename TestFixture::local_index_type;
+    using git = typename TestFixture::global_index_type;
+    using vt = typename TestFixture::value_type;
+    using ca = gko::array<comm_index_type>;
+    using ga = gko::array<git>;
+    using va = gko::array<vt>;
+    this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
+    std::vector<ca> overlap_offsets{
+        ca{this->ref, I<comm_index_type>{0, 0, 5, 8}},
+        ca{this->ref, I<comm_index_type>{0, 4, 4, 7}},
+        ca{this->ref, I<comm_index_type>{0, 4, 9, 9}}};
+    std::vector<ga> overlap_row_idxs_ref{
+        ga{this->ref, I<git>{0, 0, 5, 5, 6, 2, 3, 3}},
+        ga{this->ref, I<git>{1, 1, 4, 4, 2, 3, 3}},
+        ga{this->ref, I<git>{1, 1, 4, 4, 0, 0, 5, 5, 6}}};
+    std::vector<ga> overlap_col_idxs_ref{
+        ga{this->ref, I<git>{0, 3, 4, 5, 5, 2, 0, 3}},
+        ga{this->ref, I<git>{1, 2, 4, 6, 2, 0, 3}},
+        ga{this->ref, I<git>{1, 2, 4, 6, 0, 3, 4, 5, 5}}};
+    std::vector<va> overlap_values_ref{
+        va{this->ref, I<vt>{1, 2, 10, 11, 12, 5, 6, 7}},
+        va{this->ref, I<vt>{3, 4, 8, 9, 5, 6, 7}},
+        va{this->ref, I<vt>{3, 4, 8, 9, 1, 2, 10, 11, 12}}};
+    comm_index_type num_parts = 3;
+    auto partition =
+        gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
+            this->ref, this->mapping, num_parts);
+    auto input = this->create_input_full_rank();
+
+    gko::array<git> overlap_row_idxs{this->ref};
+    gko::array<git> overlap_col_idxs{this->ref};
+    gko::array<vt> overlap_values{this->ref};
+    for (gko::size_type i = 0; i < num_parts; i++) {
+        overlap_row_idxs.resize_and_reset(
+            overlap_offsets[i].get_data()[num_parts]);
+        overlap_col_idxs.resize_and_reset(
+            overlap_offsets[i].get_data()[num_parts]);
+        overlap_values.resize_and_reset(
+            overlap_offsets[i].get_data()[num_parts]);
+        gko::kernels::reference::distributed_matrix::fill_overlap_send_buffers(
+            this->ref, input, partition.get(), i, overlap_offsets[i],
+            overlap_row_idxs, overlap_col_idxs, overlap_values);
+        GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, overlap_row_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, overlap_col_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(overlap_values, overlap_values_ref[i]);
+    }
+}
+
+
 TYPED_TEST(Matrix, SeparateLocalNonLocalEmpty)
 {
     using lit = typename TestFixture::local_index_type;

From 769066a0ba729c5b3a5bf509f7aee9fb7412b39c Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Fri, 19 Jul 2024 12:20:14 +0000
Subject: [PATCH 4/8] Add device kernels and tests

---
 .../cuda_hip/distributed/matrix_kernels.cpp   | 94 ++++++++++++++++++-
 core/distributed/matrix.cpp                   | 51 +++++-----
 core/distributed/matrix_kernels.hpp           | 22 +++--
 dpcpp/distributed/matrix_kernels.dp.cpp       |  8 +-
 omp/distributed/matrix_kernels.cpp            | 70 +++++++++++++-
 reference/distributed/matrix_kernels.cpp      | 54 ++++++++---
 reference/test/distributed/matrix_kernels.cpp | 48 ++++++----
 test/distributed/matrix_kernels.cpp           | 60 +++++++++++-
 8 files changed, 324 insertions(+), 83 deletions(-)

diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index 70824bf03d9..ab3ec9da8b1 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -20,6 +20,9 @@
 
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
+#include "common/unified/base/kernel_launch.hpp"
+#include "core/components/format_conversion_kernels.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
 
 
 namespace gko {
@@ -55,8 +58,67 @@ void count_overlap_entries(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part,
-    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+    comm_index_type local_part, array<comm_index_type>& overlap_count,
+    array<GlobalIndexType>& overlap_positions,
+    array<GlobalIndexType>& original_positions)
+{
+    auto row_part_ids = row_partition->get_part_ids();
+    const auto* row_range_bounds = row_partition->get_range_bounds();
+    const auto* row_range_starting_indices =
+        row_partition->get_range_starting_indices();
+    const auto num_row_ranges = row_partition->get_num_ranges();
+    const auto num_input_elements = input.get_num_stored_elements();
+
+    auto policy = thrust_policy(exec);
+
+    // precompute the row and column range id of each input element
+    auto input_row_idxs = input.get_const_row_idxs();
+    array<size_type> row_range_ids{exec, num_input_elements};
+    thrust::upper_bound(policy, row_range_bounds + 1,
+                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
+                        input_row_idxs + num_input_elements,
+                        row_range_ids.get_data());
+
+    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto part_id, auto part_ids, auto range_ids,
+                      auto part_ids_per_entry, auto orig_positions) {
+            part_ids_per_entry[i] = part_ids[range_ids[i]];
+            orig_positions[i] = part_ids_per_entry[i] == part_id ? -1 : i;
+        },
+        num_input_elements, local_part, row_part_ids, row_range_ids.get_data(),
+        row_part_ids_per_entry.get_data(), original_positions.get_data());
+
+    thrust::stable_sort_by_key(
+        policy, row_part_ids_per_entry.get_data(),
+        row_part_ids_per_entry.get_data() + num_input_elements,
+        original_positions.get_data());
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto orig_positions, auto overl_positions) {
+            overl_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
+        },
+        num_input_elements, original_positions.get_const_data(),
+        overlap_positions.get_data());
+
+    components::prefix_sum_nonnegative(exec, overlap_positions.get_data(),
+                                       num_input_elements);
+    size_type num_parts = row_partition->get_num_parts();
+    array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
+    row_part_ptrs.fill(0);
+    components::convert_idxs_to_ptrs(
+        exec, row_part_ids_per_entry.get_const_data(), num_input_elements,
+        num_parts, row_part_ptrs.get_data());
+
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) {
+            count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i];
+        },
+        num_parts, local_part, row_part_ptrs.get_data(),
+        overlap_count.get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
@@ -68,10 +130,32 @@ void fill_overlap_send_buffers(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& offsets,
+    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    const array<GlobalIndexType>& original_positions,
     array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs,
-    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+{
+    auto num_entries = input.get_num_stored_elements();
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_values = input.get_const_values();
+
+    run_kernel(
+        exec,
+        [] GKO_KERNEL(auto i, auto in_rows, auto in_cols, auto in_vals,
+                      auto in_pos, auto out_pos, auto out_rows, auto out_cols,
+                      auto out_vals) {
+            if (in_pos[i] >= 0) {
+                out_rows[out_pos[i]] = in_rows[in_pos[i]];
+                out_cols[out_pos[i]] = in_cols[in_pos[i]];
+                out_vals[out_pos[i]] = in_vals[in_pos[i]];
+            }
+        },
+        num_entries, input_row_idxs, input_col_idxs, input_values,
+        original_positions.get_const_data(), overlap_positions.get_const_data(),
+        overlap_row_idxs.get_data(), overlap_col_idxs.get_data(),
+        overlap_values.get_data());
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index a64a07619b3..3799895abf3 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -4,17 +4,14 @@
 
 #include "ginkgo/core/distributed/matrix.hpp"
 
-#include <numeric>
-#include <vector>
-
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>
 #include <ginkgo/core/matrix/csr.hpp>
 #include <ginkgo/core/matrix/diagonal.hpp>
 
+#include "core/components/prefix_sum_kernels.hpp"
 #include "core/distributed/matrix_kernels.hpp"
-#include "ginkgo/core/base/mtx_io.hpp"
 
 
 namespace gko {
@@ -271,18 +268,23 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
 
     device_matrix_data<value_type, global_index_type> all_data{exec};
     if (assembly_type == assembly::communicate) {
-        array<comm_index_type> overlap_count{exec, comm.size()};
+        size_type num_entries = data.get_num_stored_elements();
+        size_type num_parts = comm.size();
+        array<comm_index_type> overlap_count{exec, num_parts};
+        array<global_index_type> overlap_positions{exec, num_entries};
+        array<global_index_type> original_positions{exec, num_entries};
         overlap_count.fill(0);
         auto tmp_part = make_temporary_clone(exec, row_partition);
         exec->run(matrix::make_count_overlap_entries(
-            data, tmp_part.get(), local_part, overlap_count));
+            data, tmp_part.get(), local_part, overlap_count, overlap_positions,
+            original_positions));
 
         overlap_count.set_executor(exec->get_master());
         std::vector<comm_index_type> overlap_send_sizes(
-            overlap_count.get_data(), overlap_count.get_data() + comm.size());
-        std::vector<comm_index_type> overlap_send_offsets(comm.size() + 1);
-        std::vector<comm_index_type> overlap_recv_sizes(comm.size());
-        std::vector<comm_index_type> overlap_recv_offsets(comm.size() + 1);
+            overlap_count.get_data(), overlap_count.get_data() + num_parts);
+        std::vector<comm_index_type> overlap_send_offsets(num_parts + 1);
+        std::vector<comm_index_type> overlap_recv_sizes(num_parts);
+        std::vector<comm_index_type> overlap_recv_offsets(num_parts + 1);
 
         std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(),
                          overlap_send_offsets.begin() + 1);
@@ -301,14 +303,10 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         array<global_index_type> overlap_recv_row_idxs{exec, n_recv};
         array<global_index_type> overlap_recv_col_idxs{exec, n_recv};
         array<value_type> overlap_recv_values{exec, n_recv};
-        auto offset_array =
-            make_const_array_view(exec->get_master(), comm.size() + 1,
-                                  overlap_send_offsets.data())
-                .copy_to_array();
-        offset_array.set_executor(exec);
         exec->run(matrix::make_fill_overlap_send_buffers(
-            data, tmp_part.get(), local_part, offset_array,
-            overlap_send_row_idxs, overlap_send_col_idxs, overlap_send_values));
+            data, tmp_part.get(), local_part, overlap_positions,
+            original_positions, overlap_send_row_idxs, overlap_send_col_idxs,
+            overlap_send_values));
 
         if (use_host_buffer) {
             overlap_send_row_idxs.set_executor(exec->get_master());
@@ -339,22 +337,21 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
             overlap_recv_values.set_executor(exec);
         }
 
-        size_type n_nnz = data.get_num_stored_elements();
-        array<global_index_type> all_row_idxs{exec, n_nnz + n_recv};
-        array<global_index_type> all_col_idxs{exec, n_nnz + n_recv};
-        array<value_type> all_values{exec, n_nnz + n_recv};
-        exec->copy_from(exec, n_nnz, data.get_const_row_idxs(),
+        array<global_index_type> all_row_idxs{exec, num_entries + n_recv};
+        array<global_index_type> all_col_idxs{exec, num_entries + n_recv};
+        array<value_type> all_values{exec, num_entries + n_recv};
+        exec->copy_from(exec, num_entries, data.get_const_row_idxs(),
                         all_row_idxs.get_data());
         exec->copy_from(exec, n_recv, overlap_recv_row_idxs.get_data(),
-                        all_row_idxs.get_data() + n_nnz);
-        exec->copy_from(exec, n_nnz, data.get_const_col_idxs(),
+                        all_row_idxs.get_data() + num_entries);
+        exec->copy_from(exec, num_entries, data.get_const_col_idxs(),
                         all_col_idxs.get_data());
         exec->copy_from(exec, n_recv, overlap_recv_col_idxs.get_data(),
-                        all_col_idxs.get_data() + n_nnz);
-        exec->copy_from(exec, n_nnz, data.get_const_values(),
+                        all_col_idxs.get_data() + num_entries);
+        exec->copy_from(exec, num_entries, data.get_const_values(),
                         all_values.get_data());
         exec->copy_from(exec, n_recv, overlap_recv_values.get_data(),
-                        all_values.get_data() + n_nnz);
+                        all_values.get_data() + num_entries);
         all_data = device_matrix_data<value_type, global_index_type>{
             exec, global_dim, all_row_idxs, all_col_idxs, all_values};
         all_data.sum_duplicates();
diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp
index fa7e891eb4e..4cdaf3e17fe 100644
--- a/core/distributed/matrix_kernels.hpp
+++ b/core/distributed/matrix_kernels.hpp
@@ -19,14 +19,16 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \
-                                          GlobalIndexType)           \
-    void count_overlap_entries(                                      \
-        std::shared_ptr<const DefaultExecutor> exec,                 \
-        const device_matrix_data<ValueType, GlobalIndexType>& input, \
-        const experimental::distributed::Partition<                  \
-            LocalIndexType, GlobalIndexType>* row_partition,         \
-        comm_index_type local_part, array<comm_index_type>& overlap_count)
+#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType,       \
+                                          GlobalIndexType)                 \
+    void count_overlap_entries(                                            \
+        std::shared_ptr<const DefaultExecutor> exec,                       \
+        const device_matrix_data<ValueType, GlobalIndexType>& input,       \
+        const experimental::distributed::Partition<                        \
+            LocalIndexType, GlobalIndexType>* row_partition,               \
+        comm_index_type local_part, array<comm_index_type>& overlap_count, \
+        array<GlobalIndexType>& overlap_positions,                         \
+        array<GlobalIndexType>& original_positions)
 
 
 #define GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \
@@ -36,7 +38,9 @@ namespace kernels {
         const device_matrix_data<ValueType, GlobalIndexType>& input,     \
         const experimental::distributed::Partition<                      \
             LocalIndexType, GlobalIndexType>* row_partition,             \
-        comm_index_type local_part, array<comm_index_type>& offsets,     \
+        comm_index_type local_part,                                      \
+        const array<GlobalIndexType>& overlap_positions,                 \
+        const array<GlobalIndexType>& original_positions,                \
         array<GlobalIndexType>& overlap_row_idxs,                        \
         array<GlobalIndexType>& overlap_col_idxs,                        \
         array<ValueType>& overlap_values)
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index 9225e58ad14..60fc0686473 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -19,8 +19,9 @@ void count_overlap_entries(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part,
-    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+    comm_index_type local_part, array<comm_index_type>& overlap_count,
+    array<GlobalIndexType>& overlap_positions,
+    array<GlobalIndexType>& original_positions) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
@@ -32,7 +33,8 @@ void fill_overlap_send_buffers(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& offsets,
+    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    const array<GlobalIndexType>& original_positions,
     array<GlobalIndexType>& overlap_row_idxs,
     array<GlobalIndexType>& overlap_col_idxs,
     array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index a3e8cb60868..55ee5524116 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -4,6 +4,8 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
+#include <algorithm>
+
 #include <omp.h>
 
 #include <ginkgo/core/base/exception_helpers.hpp>
@@ -26,8 +28,50 @@ void count_overlap_entries(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part,
-    array<comm_index_type>& overlap_count) GKO_NOT_IMPLEMENTED;
+    comm_index_type local_part, array<comm_index_type>& overlap_count,
+    array<GlobalIndexType>& overlap_positions,
+    array<GlobalIndexType>& original_positions)
+{
+    auto num_input_elements = input.get_num_stored_elements();
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto row_part_ids = row_partition->get_part_ids();
+    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
+
+    size_type row_range_id = 0;
+#pragma omp parallel for firstprivate(row_range_id)
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto global_row = input_row_idxs[i];
+        row_range_id = find_range(global_row, row_partition, row_range_id);
+        auto row_part_id = row_part_ids[row_range_id];
+        row_part_ids_per_entry.get_data()[i] = row_part_id;
+        if (row_part_id != local_part) {
+#pragma omp atomic
+            overlap_count.get_data()[row_part_id]++;
+            original_positions.get_data()[i] = i;
+        } else {
+            original_positions.get_data()[i] = -1;
+        }
+    }
+
+    auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) {
+        comm_index_type a =
+            i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
+        comm_index_type b =
+            j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j];
+        return a < b;
+    };
+    std::stable_sort(original_positions.get_data(),
+                     original_positions.get_data() + num_input_elements, comp);
+
+#pragma omp parallel for
+    for (size_type i = 0; i < num_input_elements; i++) {
+        overlap_positions.get_data()[i] =
+            original_positions.get_const_data()[i] == -1 ? 0 : 1;
+    }
+
+    components::prefix_sum_nonnegative(exec, overlap_positions.get_data(),
+                                       num_input_elements);
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
@@ -39,10 +83,26 @@ void fill_overlap_send_buffers(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& offsets,
+    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    const array<GlobalIndexType>& original_positions,
     array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs,
-    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+{
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    auto input_vals = input.get_const_values();
+
+#pragma omp parallel for
+    for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
+        auto in_pos = original_positions.get_const_data()[i];
+        if (in_pos >= 0) {
+            auto out_pos = overlap_positions.get_const_data()[i];
+            overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            overlap_values.get_data()[out_pos] = input_vals[in_pos];
+        }
+    }
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
     GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index d8b0f9e1d4f..6a57a64e075 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -4,10 +4,12 @@
 
 #include "core/distributed/matrix_kernels.hpp"
 
+#include <algorithm>
+#include <numeric>
+
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
-#include "ginkgo/core/distributed/partition.hpp"
 #include "reference/distributed/partition_helpers.hpp"
 
 
@@ -23,21 +25,47 @@ void count_overlap_entries(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count)
+    comm_index_type local_part, array<comm_index_type>& overlap_count,
+    array<GlobalIndexType>& overlap_positions,
+    array<GlobalIndexType>& original_positions)
 {
+    auto num_input_elements = input.get_num_stored_elements();
     auto input_row_idxs = input.get_const_row_idxs();
     auto row_part_ids = row_partition->get_part_ids();
+    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
 
     size_type row_range_id = 0;
     for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
         auto global_row = input_row_idxs[i];
         row_range_id = find_range(global_row, row_partition, row_range_id);
-        row_range_id = find_range(global_row, row_partition, row_range_id);
         auto row_part_id = row_part_ids[row_range_id];
+        row_part_ids_per_entry.get_data()[i] = row_part_id;
         if (row_part_id != local_part) {
             overlap_count.get_data()[row_part_id]++;
+            original_positions.get_data()[i] = i;
+        } else {
+            original_positions.get_data()[i] = -1;
         }
     }
+
+    auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) {
+        comm_index_type a =
+            i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i];
+        comm_index_type b =
+            j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j];
+        return a < b;
+    };
+
+    std::stable_sort(original_positions.get_data(),
+                     original_positions.get_data() + num_input_elements, comp);
+    for (size_type i = 0; i < num_input_elements; i++) {
+        overlap_positions.get_data()[i] =
+            original_positions.get_const_data()[i] == -1 ? 0 : 1;
+    }
+
+    std::exclusive_scan(overlap_positions.get_data(),
+                        overlap_positions.get_data() + num_input_elements,
+                        overlap_positions.get_data(), 0);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
@@ -50,26 +78,22 @@ void fill_overlap_send_buffers(
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& offsets,
+    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    const array<GlobalIndexType>& original_positions,
     array<GlobalIndexType>& overlap_row_idxs,
     array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
 {
     auto input_row_idxs = input.get_const_row_idxs();
     auto input_col_idxs = input.get_const_col_idxs();
     auto input_vals = input.get_const_values();
-    auto row_part_ids = row_partition->get_part_ids();
 
-    size_type row_range_id = 0;
     for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
-        auto global_row = input_row_idxs[i];
-        row_range_id = find_range(global_row, row_partition, row_range_id);
-        row_range_id = find_range(global_row, row_partition, row_range_id);
-        auto row_part_id = row_part_ids[row_range_id];
-        if (row_part_id != local_part) {
-            auto idx = offsets.get_data()[row_part_id]++;
-            overlap_row_idxs.get_data()[idx] = global_row;
-            overlap_col_idxs.get_data()[idx] = input_col_idxs[i];
-            overlap_values.get_data()[idx] = input_vals[i];
+        auto in_pos = original_positions.get_const_data()[i];
+        if (in_pos >= 0) {
+            auto out_pos = overlap_positions.get_const_data()[i];
+            overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            overlap_values.get_data()[out_pos] = input_vals[in_pos];
         }
     }
 }
diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp
index 00d3fcd8895..80fc8eb3330 100644
--- a/reference/test/distributed/matrix_kernels.cpp
+++ b/reference/test/distributed/matrix_kernels.cpp
@@ -17,8 +17,6 @@
 #include <ginkgo/core/matrix/csr.hpp>
 
 #include "core/test/utils.hpp"
-#include "ginkgo/core/base/array.hpp"
-#include "ginkgo/core/base/types.hpp"
 
 
 namespace {
@@ -194,24 +192,37 @@ TYPED_TEST(Matrix, CountOverlapEntries)
     using git = typename TestFixture::global_index_type;
     using vt = typename TestFixture::value_type;
     using ca = gko::array<comm_index_type>;
+    using ga = gko::array<git>;
     this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
     std::vector<ca> overlap_count_ref{
         ca{this->ref, I<comm_index_type>{0, 5, 3}},
         ca{this->ref, I<comm_index_type>{4, 0, 3}},
         ca{this->ref, I<comm_index_type>{4, 5, 0}}};
+    std::vector<ga> overlap_pos_ref{
+        ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
+    std::vector<ga> original_pos_ref{
+        ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
     comm_index_type num_parts = 3;
     auto partition =
         gko::experimental::distributed::Partition<lit, git>::build_from_mapping(
             this->ref, this->mapping, num_parts);
     auto input = this->create_input_full_rank();
 
-    gko::array<comm_index_type> overlap_count{
-        this->ref, static_cast<gko::size_type>(num_parts)};
+    ca overlap_count{this->ref, static_cast<gko::size_type>(num_parts)};
+    ga overlap_positions{this->ref, input.get_num_stored_elements()};
+    ga original_positions{this->ref, input.get_num_stored_elements()};
     for (gko::size_type i = 0; i < num_parts; i++) {
         overlap_count.fill(0);
         gko::kernels::reference::distributed_matrix::count_overlap_entries(
-            this->ref, input, partition.get(), i, overlap_count);
+            this->ref, input, partition.get(), i, overlap_count,
+            overlap_positions, original_positions);
         GKO_ASSERT_ARRAY_EQ(overlap_count, overlap_count_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(overlap_positions, overlap_pos_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(original_positions, original_pos_ref[i]);
     }
 }
 
@@ -225,10 +236,14 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
     using ga = gko::array<git>;
     using va = gko::array<vt>;
     this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
-    std::vector<ca> overlap_offsets{
-        ca{this->ref, I<comm_index_type>{0, 0, 5, 8}},
-        ca{this->ref, I<comm_index_type>{0, 4, 4, 7}},
-        ca{this->ref, I<comm_index_type>{0, 4, 9, 9}}};
+    std::vector<ga> overlap_positions{
+        ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
+        ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
+    std::vector<ga> original_positions{
+        ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
+        ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
     std::vector<ga> overlap_row_idxs_ref{
         ga{this->ref, I<git>{0, 0, 5, 5, 6, 2, 3, 3}},
         ga{this->ref, I<git>{1, 1, 4, 4, 2, 3, 3}},
@@ -251,15 +266,14 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
     gko::array<git> overlap_col_idxs{this->ref};
     gko::array<vt> overlap_values{this->ref};
     for (gko::size_type i = 0; i < num_parts; i++) {
-        overlap_row_idxs.resize_and_reset(
-            overlap_offsets[i].get_data()[num_parts]);
-        overlap_col_idxs.resize_and_reset(
-            overlap_offsets[i].get_data()[num_parts]);
-        overlap_values.resize_and_reset(
-            overlap_offsets[i].get_data()[num_parts]);
+        auto num_entries = overlap_row_idxs_ref[i].get_size();
+        overlap_row_idxs.resize_and_reset(num_entries);
+        overlap_col_idxs.resize_and_reset(num_entries);
+        overlap_values.resize_and_reset(num_entries);
         gko::kernels::reference::distributed_matrix::fill_overlap_send_buffers(
-            this->ref, input, partition.get(), i, overlap_offsets[i],
-            overlap_row_idxs, overlap_col_idxs, overlap_values);
+            this->ref, input, partition.get(), i, overlap_positions[i],
+            original_positions[i], overlap_row_idxs, overlap_col_idxs,
+            overlap_values);
         GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, overlap_row_idxs_ref[i]);
         GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, overlap_col_idxs_ref[i]);
         GKO_ASSERT_ARRAY_EQ(overlap_values, overlap_values_ref[i]);
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index ad91d699496..6de772e8006 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -48,8 +48,9 @@ class Matrix : public CommonTestFixture {
     {
         gko::device_matrix_data<value_type, global_index_type> d_input{exec,
                                                                        input};
-        for (comm_index_type part = 0; part < row_partition->get_num_parts();
-             ++part) {
+        gko::size_type num_parts = row_partition->get_num_parts();
+        gko::size_type num_entries = input.get_num_stored_elements();
+        for (comm_index_type part = 0; part < num_parts; ++part) {
             gko::array<local_index_type> local_row_idxs{ref};
             gko::array<local_index_type> local_col_idxs{ref};
             gko::array<value_type> local_values{ref};
@@ -62,6 +63,55 @@ class Matrix : public CommonTestFixture {
             gko::array<local_index_type> d_non_local_row_idxs{exec};
             gko::array<global_index_type> d_non_local_col_idxs{exec};
             gko::array<value_type> d_non_local_values{exec};
+            gko::array<comm_index_type> overlap_count{ref, num_parts};
+            overlap_count.fill(0);
+            gko::array<comm_index_type> d_overlap_count{exec, num_parts};
+            d_overlap_count.fill(0);
+            gko::array<global_index_type> overlap_positions{ref, num_entries};
+            gko::array<global_index_type> d_overlap_positions{exec,
+                                                              num_entries};
+            gko::array<global_index_type> original_positions{ref, num_entries};
+            gko::array<global_index_type> d_original_positions{exec,
+                                                               num_entries};
+
+            gko::kernels::reference::distributed_matrix::count_overlap_entries(
+                ref, input, row_partition.get(), part, overlap_count,
+                overlap_positions, original_positions);
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
+                count_overlap_entries(
+                    exec, d_input, d_row_partition.get(), part, d_overlap_count,
+                    d_overlap_positions, d_original_positions);
+
+            gko::array<global_index_type> overlap_offsets{ref, num_parts + 1};
+            std::partial_sum(overlap_count.get_data(),
+                             overlap_count.get_data() + num_parts,
+                             overlap_offsets.get_data() + 1);
+            overlap_offsets.get_data()[0] = 0;
+            gko::array<global_index_type> d_overlap_offsets{exec,
+                                                            overlap_offsets};
+            gko::size_type num_overlap_entries =
+                overlap_offsets.get_data()[num_parts];
+            gko::array<global_index_type> overlap_row_idxs{ref,
+                                                           num_overlap_entries};
+            gko::array<global_index_type> overlap_col_idxs{ref,
+                                                           num_overlap_entries};
+            gko::array<value_type> overlap_values{ref, num_overlap_entries};
+            gko::array<global_index_type> d_overlap_row_idxs{
+                exec, num_overlap_entries};
+            gko::array<global_index_type> d_overlap_col_idxs{
+                exec, num_overlap_entries};
+            gko::array<value_type> d_overlap_values{exec, num_overlap_entries};
+
+            gko::kernels::reference::distributed_matrix::
+                fill_overlap_send_buffers(ref, input, row_partition.get(), part,
+                                          overlap_positions, original_positions,
+                                          overlap_row_idxs, overlap_col_idxs,
+                                          overlap_values);
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
+                fill_overlap_send_buffers(
+                    exec, d_input, d_row_partition.get(), part,
+                    d_overlap_positions, d_original_positions,
+                    d_overlap_row_idxs, d_overlap_col_idxs, d_overlap_values);
 
             gko::kernels::reference::distributed_matrix::
                 separate_local_nonlocal(
@@ -75,6 +125,12 @@ class Matrix : public CommonTestFixture {
                     d_non_local_row_idxs, d_non_local_col_idxs,
                     d_non_local_values);
 
+            GKO_ASSERT_ARRAY_EQ(overlap_positions, d_overlap_positions);
+            GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions);
+            GKO_ASSERT_ARRAY_EQ(overlap_count, d_overlap_count);
+            GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, d_overlap_row_idxs);
+            GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, d_overlap_col_idxs);
+            GKO_ASSERT_ARRAY_EQ(overlap_values, d_overlap_values);
             GKO_ASSERT_ARRAY_EQ(local_row_idxs, d_local_row_idxs);
             GKO_ASSERT_ARRAY_EQ(local_col_idxs, d_local_col_idxs);
             GKO_ASSERT_ARRAY_EQ(local_values, d_local_values);

From 69d8d860f32adbb2352f7dd46084e0f0bbef94af Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Fri, 18 Oct 2024 17:45:52 +0200
Subject: [PATCH 5/8] Address Review comments

---
 .../cuda_hip/distributed/matrix_kernels.cpp   |  35 ++--
 core/device_hooks/common_kernels.inc.cpp      |   4 +-
 core/distributed/matrix.cpp                   | 149 +++++++++---------
 core/distributed/matrix_kernels.hpp           |  68 ++++----
 dpcpp/distributed/matrix_kernels.dp.cpp       |  20 +--
 include/ginkgo/core/distributed/matrix.hpp    |  12 +-
 omp/distributed/matrix_kernels.cpp            |  32 ++--
 reference/distributed/matrix_kernels.cpp      |  36 ++---
 reference/test/distributed/matrix_kernels.cpp |  66 ++++----
 test/distributed/matrix_kernels.cpp           |  91 +++++------
 test/mpi/matrix.cpp                           |   4 +-
 11 files changed, 253 insertions(+), 264 deletions(-)

diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index ab3ec9da8b1..159ddbce296 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -53,13 +53,13 @@ struct input_type {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_overlap_entries(
+void count_non_owning_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count,
-    array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
     array<GlobalIndexType>& original_positions)
 {
     auto row_part_ids = row_partition->get_part_ids();
@@ -96,13 +96,13 @@ void count_overlap_entries(
         original_positions.get_data());
     run_kernel(
         exec,
-        [] GKO_KERNEL(auto i, auto orig_positions, auto overl_positions) {
-            overl_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
+        [] GKO_KERNEL(auto i, auto orig_positions, auto s_positions) {
+            s_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
         },
         num_input_elements, original_positions.get_const_data(),
-        overlap_positions.get_data());
+        send_positions.get_data());
 
-    components::prefix_sum_nonnegative(exec, overlap_positions.get_data(),
+    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
                                        num_input_elements);
     size_type num_parts = row_partition->get_num_parts();
     array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
@@ -116,24 +116,23 @@ void count_overlap_entries(
         [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) {
             count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i];
         },
-        num_parts, local_part, row_part_ptrs.get_data(),
-        overlap_count.get_data());
+        num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_overlap_send_buffers(
+void fill_send_buffers(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
     const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
 {
     auto num_entries = input.get_num_stored_elements();
     auto input_row_idxs = input.get_const_row_idxs();
@@ -152,13 +151,13 @@ void fill_overlap_send_buffers(
             }
         },
         num_entries, input_row_idxs, input_col_idxs, input_values,
-        original_positions.get_const_data(), overlap_positions.get_const_data(),
-        overlap_row_idxs.get_data(), overlap_col_idxs.get_data(),
-        overlap_values.get_data());
+        original_positions.get_const_data(), send_positions.get_const_data(),
+        send_row_idxs.get_data(), send_col_idxs.get_data(),
+        send_values.get_data());
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 304bed16683..746187330f1 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -274,9 +274,9 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
 namespace distributed_matrix {
 
 
-GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_FILL_SEND_BUFFERS);
 GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL);
 
 
diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 3799895abf3..2c174f26806 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -21,10 +21,10 @@ namespace matrix {
 namespace {
 
 
-GKO_REGISTER_OPERATION(count_overlap_entries,
-                       distributed_matrix::count_overlap_entries);
-GKO_REGISTER_OPERATION(fill_overlap_send_buffers,
-                       distributed_matrix::fill_overlap_send_buffers);
+GKO_REGISTER_OPERATION(count_non_owning_entries,
+                       distributed_matrix::count_non_owning_entries);
+GKO_REGISTER_OPERATION(fill_send_buffers,
+                       distributed_matrix::fill_send_buffers);
 GKO_REGISTER_OPERATION(separate_local_nonlocal,
                        distributed_matrix::separate_local_nonlocal);
 
@@ -249,7 +249,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         col_partition,
-    assembly assembly_type)
+    assembly_mode assembly_type)
 {
     const auto comm = this->get_communicator();
     GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size());
@@ -259,6 +259,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     auto exec = this->get_executor();
     auto local_part = comm.rank();
     auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
+    auto tmp_row_partition = make_temporary_clone(exec, row_partition);
+    auto tmp_col_partition = make_temporary_clone(exec, col_partition);
 
     // set up LinOp sizes
     auto global_num_rows = row_partition->get_size();
@@ -267,74 +269,69 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     this->set_size(global_dim);
 
     device_matrix_data<value_type, global_index_type> all_data{exec};
-    if (assembly_type == assembly::communicate) {
+    if (assembly_type == assembly_mode::communicate) {
         size_type num_entries = data.get_num_stored_elements();
         size_type num_parts = comm.size();
-        array<comm_index_type> overlap_count{exec, num_parts};
-        array<global_index_type> overlap_positions{exec, num_entries};
+        array<comm_index_type> send_sizes{exec, num_parts};
+        array<global_index_type> send_positions{exec, num_entries};
         array<global_index_type> original_positions{exec, num_entries};
-        overlap_count.fill(0);
-        auto tmp_part = make_temporary_clone(exec, row_partition);
-        exec->run(matrix::make_count_overlap_entries(
-            data, tmp_part.get(), local_part, overlap_count, overlap_positions,
-            original_positions));
-
-        overlap_count.set_executor(exec->get_master());
-        std::vector<comm_index_type> overlap_send_sizes(
-            overlap_count.get_data(), overlap_count.get_data() + num_parts);
-        std::vector<comm_index_type> overlap_send_offsets(num_parts + 1);
-        std::vector<comm_index_type> overlap_recv_sizes(num_parts);
-        std::vector<comm_index_type> overlap_recv_offsets(num_parts + 1);
-
-        std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(),
-                         overlap_send_offsets.begin() + 1);
-        comm.all_to_all(exec, overlap_send_sizes.data(), 1,
-                        overlap_recv_sizes.data(), 1);
-        std::partial_sum(overlap_recv_sizes.begin(), overlap_recv_sizes.end(),
-                         overlap_recv_offsets.begin() + 1);
-        overlap_send_offsets[0] = 0;
-        overlap_recv_offsets[0] = 0;
-
-        size_type n_send = overlap_send_offsets.back();
-        size_type n_recv = overlap_recv_offsets.back();
-        array<global_index_type> overlap_send_row_idxs{exec, n_send};
-        array<global_index_type> overlap_send_col_idxs{exec, n_send};
-        array<value_type> overlap_send_values{exec, n_send};
-        array<global_index_type> overlap_recv_row_idxs{exec, n_recv};
-        array<global_index_type> overlap_recv_col_idxs{exec, n_recv};
-        array<value_type> overlap_recv_values{exec, n_recv};
-        exec->run(matrix::make_fill_overlap_send_buffers(
-            data, tmp_part.get(), local_part, overlap_positions,
-            original_positions, overlap_send_row_idxs, overlap_send_col_idxs,
-            overlap_send_values));
+        send_sizes.fill(0);
+        exec->run(matrix::make_count_non_owning_entries(
+            data, tmp_row_partition.get(), local_part, send_sizes,
+            send_positions, original_positions));
+
+        send_sizes.set_executor(exec->get_master());
+        array<comm_index_type> send_offsets{exec->get_master(), num_parts + 1};
+        array<comm_index_type> recv_sizes{exec->get_master(), num_parts};
+        array<comm_index_type> recv_offsets{exec->get_master(), num_parts + 1};
+
+        std::partial_sum(send_sizes.get_data(),
+                         send_sizes.get_data() + num_parts,
+                         send_offsets.get_data() + 1);
+        comm.all_to_all(exec, send_sizes.get_data(), 1, recv_sizes.get_data(),
+                        1);
+        std::partial_sum(recv_sizes.get_data(),
+                         recv_sizes.get_data() + num_parts,
+                         recv_offsets.get_data() + 1);
+        send_offsets.get_data()[0] = 0;
+        recv_offsets.get_data()[0] = 0;
+
+        size_type n_send = send_offsets.get_data()[num_parts];
+        size_type n_recv = recv_offsets.get_data()[num_parts];
+        array<global_index_type> send_row_idxs{exec, n_send};
+        array<global_index_type> send_col_idxs{exec, n_send};
+        array<value_type> send_values{exec, n_send};
+        array<global_index_type> recv_row_idxs{exec, n_recv};
+        array<global_index_type> recv_col_idxs{exec, n_recv};
+        array<value_type> recv_values{exec, n_recv};
+        exec->run(matrix::make_fill_send_buffers(
+            data, tmp_row_partition.get(), local_part, send_positions,
+            original_positions, send_row_idxs, send_col_idxs, send_values));
 
         if (use_host_buffer) {
-            overlap_send_row_idxs.set_executor(exec->get_master());
-            overlap_send_col_idxs.set_executor(exec->get_master());
-            overlap_send_values.set_executor(exec->get_master());
-            overlap_recv_row_idxs.set_executor(exec->get_master());
-            overlap_recv_col_idxs.set_executor(exec->get_master());
-            overlap_recv_values.set_executor(exec->get_master());
+            send_row_idxs.set_executor(exec->get_master());
+            send_col_idxs.set_executor(exec->get_master());
+            send_values.set_executor(exec->get_master());
+            recv_row_idxs.set_executor(exec->get_master());
+            recv_col_idxs.set_executor(exec->get_master());
+            recv_values.set_executor(exec->get_master());
         }
-        comm.all_to_all_v(
-            use_host_buffer ? exec : exec->get_master(),
-            overlap_send_row_idxs.get_const_data(), overlap_send_sizes.data(),
-            overlap_send_offsets.data(), overlap_recv_row_idxs.get_data(),
-            overlap_recv_sizes.data(), overlap_recv_offsets.data());
-        comm.all_to_all_v(
-            use_host_buffer ? exec : exec->get_master(),
-            overlap_send_col_idxs.get_const_data(), overlap_send_sizes.data(),
-            overlap_send_offsets.data(), overlap_recv_col_idxs.get_data(),
-            overlap_recv_sizes.data(), overlap_recv_offsets.data());
-        comm.all_to_all_v(
-            use_host_buffer ? exec : exec->get_master(),
-            overlap_send_values.get_const_data(), overlap_send_sizes.data(),
-            overlap_send_offsets.data(), overlap_recv_values.get_data(),
-            overlap_recv_sizes.data(), overlap_recv_offsets.data());
+        comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(),
+                          send_row_idxs.get_const_data(), send_sizes.get_data(),
+                          send_offsets.get_data(), recv_row_idxs.get_data(),
+                          recv_sizes.get_data(), recv_offsets.get_data());
+        comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(),
+                          send_col_idxs.get_const_data(), send_sizes.get_data(),
+                          send_offsets.get_data(), recv_col_idxs.get_data(),
+                          recv_sizes.get_data(), recv_offsets.get_data());
+        comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(),
+                          send_values.get_const_data(), send_sizes.get_data(),
+                          send_offsets.get_data(), recv_values.get_data(),
+                          recv_sizes.get_data(), recv_offsets.get_data());
         if (use_host_buffer) {
-            overlap_recv_row_idxs.set_executor(exec);
-            overlap_recv_col_idxs.set_executor(exec);
-            overlap_recv_values.set_executor(exec);
+            recv_row_idxs.set_executor(exec);
+            recv_col_idxs.set_executor(exec);
+            recv_values.set_executor(exec);
         }
 
         array<global_index_type> all_row_idxs{exec, num_entries + n_recv};
@@ -342,18 +339,19 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         array<value_type> all_values{exec, num_entries + n_recv};
         exec->copy_from(exec, num_entries, data.get_const_row_idxs(),
                         all_row_idxs.get_data());
-        exec->copy_from(exec, n_recv, overlap_recv_row_idxs.get_data(),
+        exec->copy_from(exec, n_recv, recv_row_idxs.get_data(),
                         all_row_idxs.get_data() + num_entries);
         exec->copy_from(exec, num_entries, data.get_const_col_idxs(),
                         all_col_idxs.get_data());
-        exec->copy_from(exec, n_recv, overlap_recv_col_idxs.get_data(),
+        exec->copy_from(exec, n_recv, recv_col_idxs.get_data(),
                         all_col_idxs.get_data() + num_entries);
         exec->copy_from(exec, num_entries, data.get_const_values(),
                         all_values.get_data());
-        exec->copy_from(exec, n_recv, overlap_recv_values.get_data(),
+        exec->copy_from(exec, n_recv, recv_values.get_data(),
                         all_values.get_data() + num_entries);
         all_data = device_matrix_data<value_type, global_index_type>{
-            exec, global_dim, all_row_idxs, all_col_idxs, all_values};
+            exec, global_dim, std::move(all_row_idxs), std::move(all_col_idxs),
+            std::move(all_values)};
         all_data.sum_duplicates();
     }
 
@@ -370,9 +368,8 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     // as well as the rows of the non-local block. The columns of the non-local
     // block are still in global indices.
     exec->run(matrix::make_separate_local_nonlocal(
-        assembly_type == assembly::communicate ? all_data : data,
-        make_temporary_clone(exec, row_partition).get(),
-        make_temporary_clone(exec, col_partition).get(), local_part,
+        assembly_type == assembly_mode::communicate ? all_data : data,
+        tmp_row_partition.get(), tmp_col_partition.get(), local_part,
         local_row_idxs, local_col_idxs, local_values, non_local_row_idxs,
         global_non_local_col_idxs, non_local_values));
 
@@ -456,7 +453,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         row_partition,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         col_partition,
-    assembly assembly_type)
+    assembly_mode assembly_type)
 {
     return this->read_distributed(
         device_matrix_data<value_type, global_index_type>::create_from_host(
@@ -470,7 +467,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const matrix_data<ValueType, global_index_type>& data,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         partition,
-    assembly assembly_type)
+    assembly_mode assembly_type)
 {
     return this->read_distributed(
         device_matrix_data<value_type, global_index_type>::create_from_host(
@@ -484,7 +481,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
     const device_matrix_data<ValueType, GlobalIndexType>& data,
     std::shared_ptr<const Partition<local_index_type, global_index_type>>
         partition,
-    assembly assembly_type)
+    assembly_mode assembly_type)
 {
     return this->read_distributed(data, partition, partition, assembly_type);
 }
diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp
index 4cdaf3e17fe..3ba02c27718 100644
--- a/core/distributed/matrix_kernels.hpp
+++ b/core/distributed/matrix_kernels.hpp
@@ -19,31 +19,30 @@ namespace gko {
 namespace kernels {
 
 
-#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType,       \
-                                          GlobalIndexType)                 \
-    void count_overlap_entries(                                            \
-        std::shared_ptr<const DefaultExecutor> exec,                       \
-        const device_matrix_data<ValueType, GlobalIndexType>& input,       \
-        const experimental::distributed::Partition<                        \
-            LocalIndexType, GlobalIndexType>* row_partition,               \
-        comm_index_type local_part, array<comm_index_type>& overlap_count, \
-        array<GlobalIndexType>& overlap_positions,                         \
+#define GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType, \
+                                             GlobalIndexType)           \
+    void count_non_owning_entries(                                      \
+        std::shared_ptr<const DefaultExecutor> exec,                    \
+        const device_matrix_data<ValueType, GlobalIndexType>& input,    \
+        const experimental::distributed::Partition<                     \
+            LocalIndexType, GlobalIndexType>* row_partition,            \
+        comm_index_type local_part, array<comm_index_type>& send_count, \
+        array<GlobalIndexType>& send_positions,                         \
         array<GlobalIndexType>& original_positions)
 
 
-#define GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \
-                                              GlobalIndexType)           \
-    void fill_overlap_send_buffers(                                      \
-        std::shared_ptr<const DefaultExecutor> exec,                     \
-        const device_matrix_data<ValueType, GlobalIndexType>& input,     \
-        const experimental::distributed::Partition<                      \
-            LocalIndexType, GlobalIndexType>* row_partition,             \
-        comm_index_type local_part,                                      \
-        const array<GlobalIndexType>& overlap_positions,                 \
-        const array<GlobalIndexType>& original_positions,                \
-        array<GlobalIndexType>& overlap_row_idxs,                        \
-        array<GlobalIndexType>& overlap_col_idxs,                        \
-        array<ValueType>& overlap_values)
+#define GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType,     \
+                                      GlobalIndexType)               \
+    void fill_send_buffers(                                          \
+        std::shared_ptr<const DefaultExecutor> exec,                 \
+        const device_matrix_data<ValueType, GlobalIndexType>& input, \
+        const experimental::distributed::Partition<                  \
+            LocalIndexType, GlobalIndexType>* row_partition,         \
+        comm_index_type local_part,                                  \
+        const array<GlobalIndexType>& send_positions,                \
+        const array<GlobalIndexType>& original_positions,            \
+        array<GlobalIndexType>& send_row_idxs,                       \
+        array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
 
 
 #define GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,         \
@@ -62,19 +61,18 @@ namespace kernels {
         array<ValueType>& non_local_values)
 
 
-#define GKO_DECLARE_ALL_AS_TEMPLATES                                    \
-    using comm_index_type = experimental::distributed::comm_index_type; \
-    template <typename ValueType, typename LocalIndexType,              \
-              typename GlobalIndexType>                                 \
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType,        \
-                                      GlobalIndexType);                 \
-    template <typename ValueType, typename LocalIndexType,              \
-              typename GlobalIndexType>                                 \
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType,    \
-                                          GlobalIndexType);             \
-    template <typename ValueType, typename LocalIndexType,              \
-              typename GlobalIndexType>                                 \
-    GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,      \
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                           \
+    using comm_index_type = experimental::distributed::comm_index_type;        \
+    template <typename ValueType, typename LocalIndexType,                     \
+              typename GlobalIndexType>                                        \
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType,            \
+                                         GlobalIndexType);                     \
+    template <typename ValueType, typename LocalIndexType,                     \
+              typename GlobalIndexType>                                        \
+    GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType, GlobalIndexType); \
+    template <typename ValueType, typename LocalIndexType,                     \
+              typename GlobalIndexType>                                        \
+    GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType,             \
                                         GlobalIndexType)
 
 
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index 60fc0686473..e7aad7689a8 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -14,33 +14,33 @@ namespace distributed_matrix {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_overlap_entries(
+void non_owningverlap_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count,
-    array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
     array<GlobalIndexType>& original_positions) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_overlap_send_buffers(
+void fill_send_buffers(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
     const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs,
-    array<ValueType>& overlap_values) GKO_NOT_IMPLEMENTED;
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs,
+    array<ValueType>& send_values) GKO_NOT_IMPLEMENTED;
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp
index 51e6e498348..3f1885bc8bf 100644
--- a/include/ginkgo/core/distributed/matrix.hpp
+++ b/include/ginkgo/core/distributed/matrix.hpp
@@ -134,7 +134,7 @@ namespace distributed {
 
 
 /**
- * assembly defines how the read_distributed function of the distributed
+ * assembly_mode defines how the read_distributed function of the distributed
  * matrix treats non-local indices in the (device_)matrix_data:
  * - communicate communicates the overlap between ranks and adds up all local
  *   contributions. Indices smaller than 0 or larger than the global size
@@ -142,7 +142,7 @@ namespace distributed {
  * - local_only does not communicate any overlap but ignores all non-local
  *   indices.
  */
-enum class assembly { communicate, local_only };
+enum class assembly_mode { communicate, local_only };
 
 
 template <typename LocalIndexType, typename GlobalIndexType>
@@ -310,7 +310,7 @@ class Matrix
         const device_matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             partition,
-        assembly assembly_type = assembly::local_only);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Reads a square matrix from the matrix_data structure and a global
@@ -325,7 +325,7 @@ class Matrix
         const matrix_data<value_type, global_index_type>& data,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             partition,
-        assembly assembly_type = assembly::local_only);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Reads a matrix from the device_matrix_data structure, a global row
@@ -350,7 +350,7 @@ class Matrix
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             col_partition,
-        assembly assembly_type = assembly::local_only);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Reads a matrix from the matrix_data structure, a global row partition,
@@ -367,7 +367,7 @@ class Matrix
             row_partition,
         std::shared_ptr<const Partition<local_index_type, global_index_type>>
             col_partition,
-        assembly assembly_type = assembly::local_only);
+        assembly_mode assembly_type = assembly_mode::local_only);
 
     /**
      * Get read access to the stored local matrix.
diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp
index 55ee5524116..a42b1f230fe 100644
--- a/omp/distributed/matrix_kernels.cpp
+++ b/omp/distributed/matrix_kernels.cpp
@@ -23,13 +23,13 @@ namespace distributed_matrix {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_overlap_entries(
+void count_non_owning_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count,
-    array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
     array<GlobalIndexType>& original_positions)
 {
     auto num_input_elements = input.get_num_stored_elements();
@@ -46,7 +46,7 @@ void count_overlap_entries(
         row_part_ids_per_entry.get_data()[i] = row_part_id;
         if (row_part_id != local_part) {
 #pragma omp atomic
-            overlap_count.get_data()[row_part_id]++;
+            send_count.get_data()[row_part_id]++;
             original_positions.get_data()[i] = i;
         } else {
             original_positions.get_data()[i] = -1;
@@ -65,28 +65,28 @@ void count_overlap_entries(
 
 #pragma omp parallel for
     for (size_type i = 0; i < num_input_elements; i++) {
-        overlap_positions.get_data()[i] =
+        send_positions.get_data()[i] =
             original_positions.get_const_data()[i] == -1 ? 0 : 1;
     }
 
-    components::prefix_sum_nonnegative(exec, overlap_positions.get_data(),
+    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
                                        num_input_elements);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_overlap_send_buffers(
+void fill_send_buffers(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
     const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
 {
     auto input_row_idxs = input.get_const_row_idxs();
     auto input_col_idxs = input.get_const_col_idxs();
@@ -96,16 +96,16 @@ void fill_overlap_send_buffers(
     for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
         auto in_pos = original_positions.get_const_data()[i];
         if (in_pos >= 0) {
-            auto out_pos = overlap_positions.get_const_data()[i];
-            overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
-            overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
-            overlap_values.get_data()[out_pos] = input_vals[in_pos];
+            auto out_pos = send_positions.get_const_data()[i];
+            send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            send_values.get_data()[out_pos] = input_vals[in_pos];
         }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 6a57a64e075..1be2660721b 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -20,13 +20,13 @@ namespace distributed_matrix {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void count_overlap_entries(
+void count_non_owning_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, array<comm_index_type>& overlap_count,
-    array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, array<comm_index_type>& send_count,
+    array<GlobalIndexType>& send_positions,
     array<GlobalIndexType>& original_positions)
 {
     auto num_input_elements = input.get_num_stored_elements();
@@ -41,7 +41,7 @@ void count_overlap_entries(
         auto row_part_id = row_part_ids[row_range_id];
         row_part_ids_per_entry.get_data()[i] = row_part_id;
         if (row_part_id != local_part) {
-            overlap_count.get_data()[row_part_id]++;
+            send_count.get_data()[row_part_id]++;
             original_positions.get_data()[i] = i;
         } else {
             original_positions.get_data()[i] = -1;
@@ -59,29 +59,29 @@ void count_overlap_entries(
     std::stable_sort(original_positions.get_data(),
                      original_positions.get_data() + num_input_elements, comp);
     for (size_type i = 0; i < num_input_elements; i++) {
-        overlap_positions.get_data()[i] =
+        send_positions.get_data()[i] =
             original_positions.get_const_data()[i] == -1 ? 0 : 1;
     }
 
-    std::exclusive_scan(overlap_positions.get_data(),
-                        overlap_positions.get_data() + num_input_elements,
-                        overlap_positions.get_data(), 0);
+    std::exclusive_scan(send_positions.get_data(),
+                        send_positions.get_data() + num_input_elements,
+                        send_positions.get_data(), 0);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_COUNT_OVERLAP_ENTRIES);
+    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void fill_overlap_send_buffers(
+void fill_send_buffers(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
         row_partition,
-    comm_index_type local_part, const array<GlobalIndexType>& overlap_positions,
+    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
     const array<GlobalIndexType>& original_positions,
-    array<GlobalIndexType>& overlap_row_idxs,
-    array<GlobalIndexType>& overlap_col_idxs, array<ValueType>& overlap_values)
+    array<GlobalIndexType>& send_row_idxs,
+    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
 {
     auto input_row_idxs = input.get_const_row_idxs();
     auto input_col_idxs = input.get_const_col_idxs();
@@ -90,16 +90,16 @@ void fill_overlap_send_buffers(
     for (size_type i = 0; i < input.get_num_stored_elements(); ++i) {
         auto in_pos = original_positions.get_const_data()[i];
         if (in_pos >= 0) {
-            auto out_pos = overlap_positions.get_const_data()[i];
-            overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
-            overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
-            overlap_values.get_data()[out_pos] = input_vals[in_pos];
+            auto out_pos = send_positions.get_const_data()[i];
+            send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos];
+            send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos];
+            send_values.get_data()[out_pos] = input_vals[in_pos];
         }
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
-    GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS);
+    GKO_DECLARE_FILL_SEND_BUFFERS);
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp
index 80fc8eb3330..80063f7e582 100644
--- a/reference/test/distributed/matrix_kernels.cpp
+++ b/reference/test/distributed/matrix_kernels.cpp
@@ -190,15 +190,13 @@ TYPED_TEST(Matrix, CountOverlapEntries)
 {
     using lit = typename TestFixture::local_index_type;
     using git = typename TestFixture::global_index_type;
-    using vt = typename TestFixture::value_type;
     using ca = gko::array<comm_index_type>;
     using ga = gko::array<git>;
     this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
-    std::vector<ca> overlap_count_ref{
-        ca{this->ref, I<comm_index_type>{0, 5, 3}},
-        ca{this->ref, I<comm_index_type>{4, 0, 3}},
-        ca{this->ref, I<comm_index_type>{4, 5, 0}}};
-    std::vector<ga> overlap_pos_ref{
+    std::vector<ca> send_count_ref{ca{this->ref, I<comm_index_type>{0, 5, 3}},
+                                   ca{this->ref, I<comm_index_type>{4, 0, 3}},
+                                   ca{this->ref, I<comm_index_type>{4, 5, 0}}};
+    std::vector<ga> send_pos_ref{
         ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
         ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
         ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
@@ -212,16 +210,18 @@ TYPED_TEST(Matrix, CountOverlapEntries)
             this->ref, this->mapping, num_parts);
     auto input = this->create_input_full_rank();
 
-    ca overlap_count{this->ref, static_cast<gko::size_type>(num_parts)};
-    ga overlap_positions{this->ref, input.get_num_stored_elements()};
+    ca send_count{this->ref, static_cast<gko::size_type>(num_parts)};
+    ga send_positions{this->ref, input.get_num_stored_elements()};
     ga original_positions{this->ref, input.get_num_stored_elements()};
     for (gko::size_type i = 0; i < num_parts; i++) {
-        overlap_count.fill(0);
-        gko::kernels::reference::distributed_matrix::count_overlap_entries(
-            this->ref, input, partition.get(), i, overlap_count,
-            overlap_positions, original_positions);
-        GKO_ASSERT_ARRAY_EQ(overlap_count, overlap_count_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(overlap_positions, overlap_pos_ref[i]);
+        send_count.fill(0);
+
+        gko::kernels::reference::distributed_matrix::count_non_owning_entries(
+            this->ref, input, partition.get(), i, send_count, send_positions,
+            original_positions);
+
+        GKO_ASSERT_ARRAY_EQ(send_count, send_count_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(send_positions, send_pos_ref[i]);
         GKO_ASSERT_ARRAY_EQ(original_positions, original_pos_ref[i]);
     }
 }
@@ -232,11 +232,10 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
     using lit = typename TestFixture::local_index_type;
     using git = typename TestFixture::global_index_type;
     using vt = typename TestFixture::value_type;
-    using ca = gko::array<comm_index_type>;
     using ga = gko::array<git>;
     using va = gko::array<vt>;
     this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}};
-    std::vector<ga> overlap_positions{
+    std::vector<ga> send_positions{
         ga{this->ref, I<git>{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}},
         ga{this->ref, I<git>{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}},
         ga{this->ref, I<git>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}};
@@ -244,15 +243,15 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
         ga{this->ref, I<git>{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}},
         ga{this->ref, I<git>{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}},
         ga{this->ref, I<git>{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}};
-    std::vector<ga> overlap_row_idxs_ref{
+    std::vector<ga> send_row_idxs_ref{
         ga{this->ref, I<git>{0, 0, 5, 5, 6, 2, 3, 3}},
         ga{this->ref, I<git>{1, 1, 4, 4, 2, 3, 3}},
         ga{this->ref, I<git>{1, 1, 4, 4, 0, 0, 5, 5, 6}}};
-    std::vector<ga> overlap_col_idxs_ref{
+    std::vector<ga> send_col_idxs_ref{
         ga{this->ref, I<git>{0, 3, 4, 5, 5, 2, 0, 3}},
         ga{this->ref, I<git>{1, 2, 4, 6, 2, 0, 3}},
         ga{this->ref, I<git>{1, 2, 4, 6, 0, 3, 4, 5, 5}}};
-    std::vector<va> overlap_values_ref{
+    std::vector<va> send_values_ref{
         va{this->ref, I<vt>{1, 2, 10, 11, 12, 5, 6, 7}},
         va{this->ref, I<vt>{3, 4, 8, 9, 5, 6, 7}},
         va{this->ref, I<vt>{3, 4, 8, 9, 1, 2, 10, 11, 12}}};
@@ -262,21 +261,22 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers)
             this->ref, this->mapping, num_parts);
     auto input = this->create_input_full_rank();
 
-    gko::array<git> overlap_row_idxs{this->ref};
-    gko::array<git> overlap_col_idxs{this->ref};
-    gko::array<vt> overlap_values{this->ref};
+    gko::array<git> send_row_idxs{this->ref};
+    gko::array<git> send_col_idxs{this->ref};
+    gko::array<vt> send_values{this->ref};
     for (gko::size_type i = 0; i < num_parts; i++) {
-        auto num_entries = overlap_row_idxs_ref[i].get_size();
-        overlap_row_idxs.resize_and_reset(num_entries);
-        overlap_col_idxs.resize_and_reset(num_entries);
-        overlap_values.resize_and_reset(num_entries);
-        gko::kernels::reference::distributed_matrix::fill_overlap_send_buffers(
-            this->ref, input, partition.get(), i, overlap_positions[i],
-            original_positions[i], overlap_row_idxs, overlap_col_idxs,
-            overlap_values);
-        GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, overlap_row_idxs_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, overlap_col_idxs_ref[i]);
-        GKO_ASSERT_ARRAY_EQ(overlap_values, overlap_values_ref[i]);
+        auto num_entries = send_row_idxs_ref[i].get_size();
+        send_row_idxs.resize_and_reset(num_entries);
+        send_col_idxs.resize_and_reset(num_entries);
+        send_values.resize_and_reset(num_entries);
+
+        gko::kernels::reference::distributed_matrix::fill_send_buffers(
+            this->ref, input, partition.get(), i, send_positions[i],
+            original_positions[i], send_row_idxs, send_col_idxs, send_values);
+
+        GKO_ASSERT_ARRAY_EQ(send_row_idxs, send_row_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(send_col_idxs, send_col_idxs_ref[i]);
+        GKO_ASSERT_ARRAY_EQ(send_values, send_values_ref[i]);
     }
 }
 
diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp
index 6de772e8006..2cef5a49f92 100644
--- a/test/distributed/matrix_kernels.cpp
+++ b/test/distributed/matrix_kernels.cpp
@@ -63,55 +63,50 @@ class Matrix : public CommonTestFixture {
             gko::array<local_index_type> d_non_local_row_idxs{exec};
             gko::array<global_index_type> d_non_local_col_idxs{exec};
             gko::array<value_type> d_non_local_values{exec};
-            gko::array<comm_index_type> overlap_count{ref, num_parts};
-            overlap_count.fill(0);
-            gko::array<comm_index_type> d_overlap_count{exec, num_parts};
-            d_overlap_count.fill(0);
-            gko::array<global_index_type> overlap_positions{ref, num_entries};
-            gko::array<global_index_type> d_overlap_positions{exec,
-                                                              num_entries};
+            gko::array<comm_index_type> send_count{ref, num_parts};
+            send_count.fill(0);
+            gko::array<comm_index_type> d_send_count{exec, num_parts};
+            d_send_count.fill(0);
+            gko::array<global_index_type> send_positions{ref, num_entries};
+            gko::array<global_index_type> d_send_positions{exec, num_entries};
             gko::array<global_index_type> original_positions{ref, num_entries};
             gko::array<global_index_type> d_original_positions{exec,
                                                                num_entries};
 
-            gko::kernels::reference::distributed_matrix::count_overlap_entries(
-                ref, input, row_partition.get(), part, overlap_count,
-                overlap_positions, original_positions);
-            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
-                count_overlap_entries(
-                    exec, d_input, d_row_partition.get(), part, d_overlap_count,
-                    d_overlap_positions, d_original_positions);
-
-            gko::array<global_index_type> overlap_offsets{ref, num_parts + 1};
-            std::partial_sum(overlap_count.get_data(),
-                             overlap_count.get_data() + num_parts,
-                             overlap_offsets.get_data() + 1);
-            overlap_offsets.get_data()[0] = 0;
-            gko::array<global_index_type> d_overlap_offsets{exec,
-                                                            overlap_offsets};
-            gko::size_type num_overlap_entries =
-                overlap_offsets.get_data()[num_parts];
-            gko::array<global_index_type> overlap_row_idxs{ref,
-                                                           num_overlap_entries};
-            gko::array<global_index_type> overlap_col_idxs{ref,
-                                                           num_overlap_entries};
-            gko::array<value_type> overlap_values{ref, num_overlap_entries};
-            gko::array<global_index_type> d_overlap_row_idxs{
-                exec, num_overlap_entries};
-            gko::array<global_index_type> d_overlap_col_idxs{
-                exec, num_overlap_entries};
-            gko::array<value_type> d_overlap_values{exec, num_overlap_entries};
-
             gko::kernels::reference::distributed_matrix::
-                fill_overlap_send_buffers(ref, input, row_partition.get(), part,
-                                          overlap_positions, original_positions,
-                                          overlap_row_idxs, overlap_col_idxs,
-                                          overlap_values);
+                count_non_owning_entries(ref, input, row_partition.get(), part,
+                                         send_count, send_positions,
+                                         original_positions);
+            gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
+                count_non_owning_entries(exec, d_input, d_row_partition.get(),
+                                         part, d_send_count, d_send_positions,
+                                         d_original_positions);
+
+            gko::array<global_index_type> send_offsets{ref, num_parts + 1};
+            std::partial_sum(send_count.get_data(),
+                             send_count.get_data() + num_parts,
+                             send_offsets.get_data() + 1);
+            send_offsets.get_data()[0] = 0;
+            gko::array<global_index_type> d_send_offsets{exec, send_offsets};
+            gko::size_type num_send_entries =
+                send_offsets.get_data()[num_parts];
+            gko::array<global_index_type> send_row_idxs{ref, num_send_entries};
+            gko::array<global_index_type> send_col_idxs{ref, num_send_entries};
+            gko::array<value_type> send_values{ref, num_send_entries};
+            gko::array<global_index_type> d_send_row_idxs{exec,
+                                                          num_send_entries};
+            gko::array<global_index_type> d_send_col_idxs{exec,
+                                                          num_send_entries};
+            gko::array<value_type> d_send_values{exec, num_send_entries};
+
+            gko::kernels::reference::distributed_matrix::fill_send_buffers(
+                ref, input, row_partition.get(), part, send_positions,
+                original_positions, send_row_idxs, send_col_idxs, send_values);
             gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix::
-                fill_overlap_send_buffers(
-                    exec, d_input, d_row_partition.get(), part,
-                    d_overlap_positions, d_original_positions,
-                    d_overlap_row_idxs, d_overlap_col_idxs, d_overlap_values);
+                fill_send_buffers(exec, d_input, d_row_partition.get(), part,
+                                  d_send_positions, d_original_positions,
+                                  d_send_row_idxs, d_send_col_idxs,
+                                  d_send_values);
 
             gko::kernels::reference::distributed_matrix::
                 separate_local_nonlocal(
@@ -125,12 +120,12 @@ class Matrix : public CommonTestFixture {
                     d_non_local_row_idxs, d_non_local_col_idxs,
                     d_non_local_values);
 
-            GKO_ASSERT_ARRAY_EQ(overlap_positions, d_overlap_positions);
+            GKO_ASSERT_ARRAY_EQ(send_positions, d_send_positions);
             GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions);
-            GKO_ASSERT_ARRAY_EQ(overlap_count, d_overlap_count);
-            GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, d_overlap_row_idxs);
-            GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, d_overlap_col_idxs);
-            GKO_ASSERT_ARRAY_EQ(overlap_values, d_overlap_values);
+            GKO_ASSERT_ARRAY_EQ(send_count, d_send_count);
+            GKO_ASSERT_ARRAY_EQ(send_row_idxs, d_send_row_idxs);
+            GKO_ASSERT_ARRAY_EQ(send_col_idxs, d_send_col_idxs);
+            GKO_ASSERT_ARRAY_EQ(send_values, d_send_values);
             GKO_ASSERT_ARRAY_EQ(local_row_idxs, d_local_row_idxs);
             GKO_ASSERT_ARRAY_EQ(local_col_idxs, d_local_col_idxs);
             GKO_ASSERT_ARRAY_EQ(local_values, d_local_values);
diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp
index 1b25ad7eb6d..0cfb3aca477 100644
--- a/test/mpi/matrix.cpp
+++ b/test/mpi/matrix.cpp
@@ -152,7 +152,7 @@ TYPED_TEST(MatrixCreation, ReadsDistributedLocalDataWithCommunicate)
 
     this->dist_mat->read_distributed(
         this->dist_input[rank], this->row_part,
-        gko::experimental::distributed::assembly::communicate);
+        gko::experimental::distributed::assembly_mode::communicate);
 
     GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
                         res_local[rank], 0);
@@ -191,7 +191,7 @@ TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartitionAndCommunicate)
 
     this->dist_mat->read_distributed(
         this->dist_input[rank], this->row_part, this->col_part,
-        gko::experimental::distributed::assembly::communicate);
+        gko::experimental::distributed::assembly_mode::communicate);
 
     GKO_ASSERT_MTX_NEAR(gko::as<csr>(this->dist_mat->get_local_matrix()),
                         res_local[rank], 0);

From fda55dac669a1d6bb261a04663314af293bae01c Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Mon, 21 Oct 2024 12:43:30 +0200
Subject: [PATCH 6/8] Small fixes

---
 core/distributed/matrix.cpp              | 2 +-
 dpcpp/distributed/matrix_kernels.dp.cpp  | 2 +-
 reference/distributed/matrix_kernels.cpp | 7 +++----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index 2c174f26806..acd1b415ef5 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -275,7 +275,7 @@ void Matrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
         array<comm_index_type> send_sizes{exec, num_parts};
         array<global_index_type> send_positions{exec, num_entries};
         array<global_index_type> original_positions{exec, num_entries};
-        send_sizes.fill(0);
+        send_sizes.fill(zero<comm_index_type>());
         exec->run(matrix::make_count_non_owning_entries(
             data, tmp_row_partition.get(), local_part, send_sizes,
             send_positions, original_positions));
diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp
index e7aad7689a8..8582fab9378 100644
--- a/dpcpp/distributed/matrix_kernels.dp.cpp
+++ b/dpcpp/distributed/matrix_kernels.dp.cpp
@@ -14,7 +14,7 @@ namespace distributed_matrix {
 
 
 template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
-void non_owningverlap_entries(
+void count_non_owning_entries(
     std::shared_ptr<const DefaultExecutor> exec,
     const device_matrix_data<ValueType, GlobalIndexType>& input,
     const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp
index 1be2660721b..915ec109657 100644
--- a/reference/distributed/matrix_kernels.cpp
+++ b/reference/distributed/matrix_kernels.cpp
@@ -5,11 +5,11 @@
 #include "core/distributed/matrix_kernels.hpp"
 
 #include <algorithm>
-#include <numeric>
 
 #include "core/base/allocator.hpp"
 #include "core/base/device_matrix_data_kernels.hpp"
 #include "core/base/iterator_factory.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
 #include "reference/distributed/partition_helpers.hpp"
 
 
@@ -63,9 +63,8 @@ void count_non_owning_entries(
             original_positions.get_const_data()[i] == -1 ? 0 : 1;
     }
 
-    std::exclusive_scan(send_positions.get_data(),
-                        send_positions.get_data() + num_input_elements,
-                        send_positions.get_data(), 0);
+    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
+                                       num_input_elements);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(

From a7900d5f8ad60b7b114a0f66ff009a625c0ca784 Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Mon, 21 Oct 2024 14:00:26 +0200
Subject: [PATCH 7/8] Add missing include

---
 core/distributed/matrix.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp
index acd1b415ef5..dc04e9a9545 100644
--- a/core/distributed/matrix.cpp
+++ b/core/distributed/matrix.cpp
@@ -4,6 +4,7 @@
 
 #include "ginkgo/core/distributed/matrix.hpp"
 
+#include <ginkgo/core/base/array.hpp>
 #include <ginkgo/core/base/precision_dispatch.hpp>
 #include <ginkgo/core/distributed/vector.hpp>
 #include <ginkgo/core/matrix/coo.hpp>

From 1efda8fc6a383ff97be6f73917d3e93401c67c6c Mon Sep 17 00:00:00 2001
From: Fritz Goebel <fritz.goebel@kit.edu>
Date: Tue, 22 Oct 2024 10:49:55 +0200
Subject: [PATCH 8/8] Fix circular dependency with array.fill

---
 common/cuda_hip/distributed/matrix_kernels.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp
index 159ddbce296..a1a75a5af63 100644
--- a/common/cuda_hip/distributed/matrix_kernels.cpp
+++ b/common/cuda_hip/distributed/matrix_kernels.cpp
@@ -21,6 +21,7 @@
 #include "common/cuda_hip/base/thrust.hpp"
 #include "common/cuda_hip/components/atomic.hpp"
 #include "common/unified/base/kernel_launch.hpp"
+#include "core/components/fill_array_kernels.hpp"
 #include "core/components/format_conversion_kernels.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 
@@ -106,7 +107,9 @@ void count_non_owning_entries(
                                        num_input_elements);
     size_type num_parts = row_partition->get_num_parts();
     array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
-    row_part_ptrs.fill(0);
+    components::fill_array(exec, row_part_ptrs.get_data(), num_parts + 1,
+                           zero<comm_index_type>());
+
     components::convert_idxs_to_ptrs(
         exec, row_part_ids_per_entry.get_const_data(), num_input_elements,
         num_parts, row_part_ptrs.get_data());