From e9d4f1ffb5efd5783933efea4cff0eceeac05ce5 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Thu, 11 Jul 2024 16:41:43 +0000 Subject: [PATCH 1/8] Add overlap communication to read_distributed --- .../cuda_hip/distributed/matrix_kernels.cpp | 28 ++++++ core/device_hooks/common_kernels.inc.cpp | 3 + core/distributed/matrix.cpp | 97 +++++++++++++++++-- core/distributed/matrix_kernels.hpp | 31 ++++++ dpcpp/distributed/matrix_kernels.dp.cpp | 28 ++++++ include/ginkgo/core/distributed/matrix.hpp | 24 ++++- omp/distributed/matrix_kernels.cpp | 28 ++++++ reference/distributed/matrix_kernels.cpp | 62 ++++++++++++ 8 files changed, 289 insertions(+), 12 deletions(-) diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index 88988febbb0..70824bf03d9 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -49,6 +49,34 @@ struct input_type { }; +template +void count_overlap_entries( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, + array& overlap_count) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_COUNT_OVERLAP_ENTRIES); + + +template +void fill_overlap_send_buffers( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, array& offsets, + array& overlap_row_idxs, + array& overlap_col_idxs, + array& overlap_values) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); + + template void separate_local_nonlocal( std::shared_ptr exec, diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 1ba925e94e3..304bed16683 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -274,6 +274,9 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( namespace distributed_matrix { +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_COUNT_OVERLAP_ENTRIES); +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 63f359cc40a..e4bae04d180 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -4,6 +4,9 @@ #include "ginkgo/core/distributed/matrix.hpp" +#include +#include + #include #include #include @@ -20,6 +23,10 @@ namespace matrix { namespace { +GKO_REGISTER_OPERATION(count_overlap_entries, + distributed_matrix::count_overlap_entries); +GKO_REGISTER_OPERATION(fill_overlap_send_buffers, + distributed_matrix::fill_overlap_send_buffers); GKO_REGISTER_OPERATION(separate_local_nonlocal, distributed_matrix::separate_local_nonlocal); @@ -243,7 +250,8 @@ void Matrix::read_distributed( std::shared_ptr> row_partition, std::shared_ptr> - col_partition) + col_partition, + assembly assembly_type) { const auto comm = this->get_communicator(); GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size()); @@ -252,6 +260,7 @@ void Matrix::read_distributed( GKO_ASSERT_EQ(comm.size(), col_partition->get_num_parts()); auto exec = this->get_executor(); auto local_part = comm.rank(); + auto use_host_buffer = mpi::requires_host_buffer(exec, comm); // set up LinOp sizes auto num_parts = static_cast(row_partition->get_num_parts()); @@ -260,6 +269,76 @@ void Matrix::read_distributed( dim<2> global_dim{global_num_rows, global_num_cols}; this->set_size(global_dim); + if (assembly_type == assembly::communicate) { + array overlap_count{exec, num_parts}; + overlap_count.fill(0); + auto tmp_part = make_temporary_clone(exec, row_partition); + exec->run(matrix::make_count_overlap_entries( + data, tmp_part.get(), local_part, overlap_count)); + + overlap_count.set_executor(exec->get_master()); + std::vector overlap_send_sizes( + overlap_count.get_data(), overlap_count.get_data() + num_parts); + std::vector overlap_send_offsets(num_parts + 1); + std::vector overlap_recv_sizes(num_parts); + std::vector overlap_recv_offsets(num_parts + 1); + + std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(), + overlap_send_offsets.begin() + 1); + comm.all_to_all(exec, overlap_send_sizes.data(), 1, + overlap_recv_sizes.data(), 1); + std::partial_sum(overlap_recv_sizes.begin(), overlap_recv_sizes.end(), + overlap_recv_offsets.begin() + 1); + overlap_send_offsets[0] = 0; + overlap_recv_offsets[0] = 0; + + size_type n_send = overlap_send_offsets.back(); + size_type n_recv = overlap_recv_offsets.back(); + array overlap_send_row_idxs{exec, n_send}; + array overlap_send_col_idxs{exec, n_send}; + array overlap_send_values{exec, n_send}; + array overlap_recv_row_idxs{exec, n_recv}; + array overlap_recv_col_idxs{exec, n_recv}; + array overlap_recv_values{exec, n_recv}; + auto offset_array = + make_const_array_view(exec->get_master(), num_parts + 1, + overlap_send_offsets.data()) + .copy_to_array(); + offset_array.set_executor(exec); + exec->run(matrix::make_fill_overlap_send_buffers( + data, tmp_part.get(), local_part, offset_array, + overlap_send_row_idxs, overlap_send_col_idxs, overlap_send_values)); + + if (use_host_buffer) { + overlap_send_row_idxs.set_executor(exec->get_master()); + overlap_send_col_idxs.set_executor(exec->get_master()); + overlap_send_values.set_executor(exec->get_master()); + overlap_recv_row_idxs.set_executor(exec->get_master()); + overlap_recv_col_idxs.set_executor(exec->get_master()); + overlap_recv_values.set_executor(exec->get_master()); + } + comm.all_to_all_v( + use_host_buffer ? exec : exec->get_master(), + overlap_send_row_idxs.get_const_data(), overlap_send_sizes.data(), + overlap_send_offsets.data(), overlap_recv_row_idxs.get_data(), + overlap_recv_sizes.data(), overlap_recv_offsets.data()); + comm.all_to_all_v( + use_host_buffer ? exec : exec->get_master(), + overlap_send_col_idxs.get_const_data(), overlap_send_sizes.data(), + overlap_send_offsets.data(), overlap_recv_col_idxs.get_data(), + overlap_recv_sizes.data(), overlap_recv_offsets.data()); + comm.all_to_all_v( + use_host_buffer ? exec : exec->get_master(), + overlap_send_values.get_const_data(), overlap_send_sizes.data(), + overlap_send_offsets.data(), overlap_recv_values.get_data(), + overlap_recv_sizes.data(), overlap_recv_offsets.data()); + if (use_host_buffer) { + overlap_recv_row_idxs.set_executor(exec); + overlap_recv_col_idxs.set_executor(exec); + overlap_recv_values.set_executor(exec); + } + } + // temporary storage for the output array local_row_idxs{exec}; array local_col_idxs{exec}; @@ -335,7 +414,6 @@ void Matrix::read_distributed( imap.get_executor(), imap.get_non_local_size(), imap.get_remote_local_idxs().get_const_flat_data()) .copy_to_array(); - auto use_host_buffer = mpi::requires_host_buffer(exec, comm); if (use_host_buffer) { recv_gather_idxs.set_executor(exec->get_master()); gather_idxs_.clear(); @@ -358,12 +436,13 @@ void Matrix::read_distributed( std::shared_ptr> row_partition, std::shared_ptr> - col_partition) + col_partition, + assembly assembly_type) { return this->read_distributed( device_matrix_data::create_from_host( this->get_executor(), data), - row_partition, col_partition); + row_partition, col_partition, assembly_type); } @@ -371,12 +450,13 @@ template void Matrix::read_distributed( const matrix_data& data, std::shared_ptr> - partition) + partition, + assembly assembly_type) { return this->read_distributed( device_matrix_data::create_from_host( this->get_executor(), data), - partition, partition); + partition, partition, assembly_type); } @@ -384,9 +464,10 @@ template void Matrix::read_distributed( const device_matrix_data& data, std::shared_ptr> - partition) + partition, + assembly assembly_type) { - return this->read_distributed(data, partition, partition); + return this->read_distributed(data, partition, partition, assembly_type); } diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index f24e8c9945e..fa7e891eb4e 100644 --- a/core/distributed/matrix_kernels.hpp +++ b/core/distributed/matrix_kernels.hpp @@ -19,6 +19,29 @@ namespace gko { namespace kernels { +#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void count_overlap_entries( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* row_partition, \ + comm_index_type local_part, array& overlap_count) + + +#define GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void fill_overlap_send_buffers( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* row_partition, \ + comm_index_type local_part, array& offsets, \ + array& overlap_row_idxs, \ + array& overlap_col_idxs, \ + array& overlap_values) + + #define GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ GlobalIndexType) \ void separate_local_nonlocal( \ @@ -37,6 +60,14 @@ namespace kernels { #define GKO_DECLARE_ALL_AS_TEMPLATES \ using comm_index_type = experimental::distributed::comm_index_type; \ + template \ + GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \ + GlobalIndexType); \ + template \ + GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \ + GlobalIndexType); \ template \ GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp index 47adaaeca59..9225e58ad14 100644 --- a/dpcpp/distributed/matrix_kernels.dp.cpp +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -13,6 +13,34 @@ namespace dpcpp { namespace distributed_matrix { +template +void count_overlap_entries( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, + array& overlap_count) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_COUNT_OVERLAP_ENTRIES); + + +template +void fill_overlap_send_buffers( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, array& offsets, + array& overlap_row_idxs, + array& overlap_col_idxs, + array& overlap_values) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); + + template void separate_local_nonlocal( std::shared_ptr exec, diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index de719bb9315..51e6e498348 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -133,6 +133,18 @@ namespace experimental { namespace distributed { +/** + * assembly defines how the read_distributed function of the distributed + * matrix treats non-local indices in the (device_)matrix_data: + * - communicate communicates the overlap between ranks and adds up all local + * contributions. Indices smaller than 0 or larger than the global size + * of the matrix are ignored. + * - local_only does not communicate any overlap but ignores all non-local + * indices. + */ +enum class assembly { communicate, local_only }; + + template class Partition; template @@ -297,7 +309,8 @@ class Matrix void read_distributed( const device_matrix_data& data, std::shared_ptr> - partition); + partition, + assembly assembly_type = assembly::local_only); /** * Reads a square matrix from the matrix_data structure and a global @@ -311,7 +324,8 @@ class Matrix void read_distributed( const matrix_data& data, std::shared_ptr> - partition); + partition, + assembly assembly_type = assembly::local_only); /** * Reads a matrix from the device_matrix_data structure, a global row @@ -335,7 +349,8 @@ class Matrix std::shared_ptr> row_partition, std::shared_ptr> - col_partition); + col_partition, + assembly assembly_type = assembly::local_only); /** * Reads a matrix from the matrix_data structure, a global row partition, @@ -351,7 +366,8 @@ class Matrix std::shared_ptr> row_partition, std::shared_ptr> - col_partition); + col_partition, + assembly assembly_type = assembly::local_only); /** * Get read access to the stored local matrix. diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp index 2f36ec4a778..a3e8cb60868 100644 --- a/omp/distributed/matrix_kernels.cpp +++ b/omp/distributed/matrix_kernels.cpp @@ -20,6 +20,34 @@ namespace omp { namespace distributed_matrix { +template +void count_overlap_entries( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, + array& overlap_count) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_COUNT_OVERLAP_ENTRIES); + + +template +void fill_overlap_send_buffers( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, array& offsets, + array& overlap_row_idxs, + array& overlap_col_idxs, + array& overlap_values) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); + + template void separate_local_nonlocal( std::shared_ptr exec, diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp index 95176b34656..d8b0f9e1d4f 100644 --- a/reference/distributed/matrix_kernels.cpp +++ b/reference/distributed/matrix_kernels.cpp @@ -7,6 +7,7 @@ #include "core/base/allocator.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/base/iterator_factory.hpp" +#include "ginkgo/core/distributed/partition.hpp" #include "reference/distributed/partition_helpers.hpp" @@ -16,6 +17,67 @@ namespace reference { namespace distributed_matrix { +template +void count_overlap_entries( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, array& overlap_count) +{ + auto input_row_idxs = input.get_const_row_idxs(); + auto row_part_ids = row_partition->get_part_ids(); + + size_type row_range_id = 0; + for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { + auto global_row = input_row_idxs[i]; + row_range_id = find_range(global_row, row_partition, row_range_id); + row_range_id = find_range(global_row, row_partition, row_range_id); + auto row_part_id = row_part_ids[row_range_id]; + if (row_part_id != local_part) { + overlap_count.get_data()[row_part_id]++; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_COUNT_OVERLAP_ENTRIES); + + +template +void fill_overlap_send_buffers( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, array& offsets, + array& overlap_row_idxs, + array& overlap_col_idxs, array& overlap_values) +{ + auto input_row_idxs = input.get_const_row_idxs(); + auto input_col_idxs = input.get_const_col_idxs(); + auto input_vals = input.get_const_values(); + auto row_part_ids = row_partition->get_part_ids(); + + size_type row_range_id = 0; + for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { + auto global_row = input_row_idxs[i]; + row_range_id = find_range(global_row, row_partition, row_range_id); + row_range_id = find_range(global_row, row_partition, row_range_id); + auto row_part_id = row_part_ids[row_range_id]; + if (row_part_id != local_part) { + auto idx = offsets.get_data()[row_part_id]++; + overlap_row_idxs.get_data()[idx] = global_row; + overlap_col_idxs.get_data()[idx] = input_col_idxs[i]; + overlap_values.get_data()[idx] = input_vals[i]; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); + + template void separate_local_nonlocal( std::shared_ptr exec, From 20f01886f1bab04fc13d970972b172e326a7b257 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Tue, 16 Jul 2024 12:14:20 +0000 Subject: [PATCH 2/8] test communicating read_distributed --- core/distributed/matrix.cpp | 38 ++++++++++++++++++++------ test/mpi/matrix.cpp | 53 ++++++++++++++++++++++++++++++++++--- 2 files changed, 80 insertions(+), 11 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index e4bae04d180..a64a07619b3 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -14,6 +14,7 @@ #include #include "core/distributed/matrix_kernels.hpp" +#include "ginkgo/core/base/mtx_io.hpp" namespace gko { @@ -263,14 +264,14 @@ void Matrix::read_distributed( auto use_host_buffer = mpi::requires_host_buffer(exec, comm); // set up LinOp sizes - auto num_parts = static_cast(row_partition->get_num_parts()); auto global_num_rows = row_partition->get_size(); auto global_num_cols = col_partition->get_size(); dim<2> global_dim{global_num_rows, global_num_cols}; this->set_size(global_dim); + device_matrix_data all_data{exec}; if (assembly_type == assembly::communicate) { - array overlap_count{exec, num_parts}; + array overlap_count{exec, comm.size()}; overlap_count.fill(0); auto tmp_part = make_temporary_clone(exec, row_partition); exec->run(matrix::make_count_overlap_entries( @@ -278,10 +279,10 @@ void Matrix::read_distributed( overlap_count.set_executor(exec->get_master()); std::vector overlap_send_sizes( - overlap_count.get_data(), overlap_count.get_data() + num_parts); - std::vector overlap_send_offsets(num_parts + 1); - std::vector overlap_recv_sizes(num_parts); - std::vector overlap_recv_offsets(num_parts + 1); + overlap_count.get_data(), overlap_count.get_data() + comm.size()); + std::vector overlap_send_offsets(comm.size() + 1); + std::vector overlap_recv_sizes(comm.size()); + std::vector overlap_recv_offsets(comm.size() + 1); std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(), overlap_send_offsets.begin() + 1); @@ -301,7 +302,7 @@ void Matrix::read_distributed( array overlap_recv_col_idxs{exec, n_recv}; array overlap_recv_values{exec, n_recv}; auto offset_array = - make_const_array_view(exec->get_master(), num_parts + 1, + make_const_array_view(exec->get_master(), comm.size() + 1, overlap_send_offsets.data()) .copy_to_array(); offset_array.set_executor(exec); @@ -337,6 +338,26 @@ void Matrix::read_distributed( overlap_recv_col_idxs.set_executor(exec); overlap_recv_values.set_executor(exec); } + + size_type n_nnz = data.get_num_stored_elements(); + array all_row_idxs{exec, n_nnz + n_recv}; + array all_col_idxs{exec, n_nnz + n_recv}; + array all_values{exec, n_nnz + n_recv}; + exec->copy_from(exec, n_nnz, data.get_const_row_idxs(), + all_row_idxs.get_data()); + exec->copy_from(exec, n_recv, overlap_recv_row_idxs.get_data(), + all_row_idxs.get_data() + n_nnz); + exec->copy_from(exec, n_nnz, data.get_const_col_idxs(), + all_col_idxs.get_data()); + exec->copy_from(exec, n_recv, overlap_recv_col_idxs.get_data(), + all_col_idxs.get_data() + n_nnz); + exec->copy_from(exec, n_nnz, data.get_const_values(), + all_values.get_data()); + exec->copy_from(exec, n_recv, overlap_recv_values.get_data(), + all_values.get_data() + n_nnz); + all_data = device_matrix_data{ + exec, global_dim, all_row_idxs, all_col_idxs, all_values}; + all_data.sum_duplicates(); } // temporary storage for the output @@ -352,7 +373,8 @@ void Matrix::read_distributed( // as well as the rows of the non-local block. The columns of the non-local // block are still in global indices. exec->run(matrix::make_separate_local_nonlocal( - data, make_temporary_clone(exec, row_partition).get(), + assembly_type == assembly::communicate ? all_data : data, + make_temporary_clone(exec, row_partition).get(), make_temporary_clone(exec, col_partition).get(), local_part, local_row_idxs, local_col_idxs, local_values, non_local_row_idxs, global_non_local_col_idxs, non_local_values)); diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index f4b8af2fb19..1b25ad7eb6d 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -60,9 +60,16 @@ class MatrixCreation : public CommonMpiTestFixture { {3, 4, 7}, {4, 0, 9}, {4, 4, 10}}}, - dist_input{{{size, {{0, 1, 1}, {0, 3, 2}, {1, 1, 3}, {1, 2, 4}}}, - {size, {{2, 1, 5}, {2, 2, 6}, {3, 3, 8}, {3, 4, 7}}}, - {size, {{4, 0, 9}, {4, 4, 10}}}}}, + dist_input{ + {{size, + {{0, 1, 1}, + {0, 3, 2}, + {1, 1, 3}, + {1, 2, 4}, + {2, 0, 1}, + {2, 3, 1}}}, + {size, {{0, 0, 1}, {2, 1, 5}, {2, 2, 6}, {3, 3, 8}, {3, 4, 7}}}, + {size, {{2, 2, 1}, {3, 3, -1}, {4, 0, 9}, {4, 4, 10}}}}}, engine(42) { row_part = Partition::build_from_contiguous( @@ -134,6 +141,26 @@ TYPED_TEST(MatrixCreation, ReadsDistributedLocalData) } +TYPED_TEST(MatrixCreation, ReadsDistributedLocalDataWithCommunicate) +{ + using value_type = typename TestFixture::value_type; + using csr = typename TestFixture::local_matrix_type; + I> res_local[] = {{{1, 1}, {0, 3}}, {{7, 1}, {0, 7}}, {{10}}}; + I> res_non_local[] = { + {{0, 2}, {4, 0}}, {{1, 5, 0}, {0, 0, 7}}, {{9}}}; + auto rank = this->dist_mat->get_communicator().rank(); + + this->dist_mat->read_distributed( + this->dist_input[rank], this->row_part, + gko::experimental::distributed::assembly::communicate); + + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), + res_local[rank], 0); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), + res_non_local[rank], 0); +} + + TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition) { using value_type = typename TestFixture::value_type; @@ -153,6 +180,26 @@ TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition) } +TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartitionAndCommunicate) +{ + using value_type = typename TestFixture::value_type; + using csr = typename TestFixture::local_matrix_type; + I> res_local[] = {{{2, 0}, {0, 0}}, {{1, 5}, {0, 0}}, {{0}}}; + I> res_non_local[] = { + {{1, 1, 0}, {0, 3, 4}}, {{1, 0, 7}, {7, 7, 0}}, {{10, 9}}}; + auto rank = this->dist_mat->get_communicator().rank(); + + this->dist_mat->read_distributed( + this->dist_input[rank], this->row_part, this->col_part, + gko::experimental::distributed::assembly::communicate); + + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), + res_local[rank], 0); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), + res_non_local[rank], 0); +} + + TYPED_TEST(MatrixCreation, BuildOnlyLocal) { using value_type = typename TestFixture::value_type; From e193811ea4c3b3552243f4753a3d410200f5454f Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Wed, 17 Jul 2024 12:28:11 +0000 Subject: [PATCH 3/8] Add reference kernel tests --- reference/test/distributed/matrix_kernels.cpp | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp index a34844cbde9..00d3fcd8895 100644 --- a/reference/test/distributed/matrix_kernels.cpp +++ b/reference/test/distributed/matrix_kernels.cpp @@ -17,6 +17,8 @@ #include #include "core/test/utils.hpp" +#include "ginkgo/core/base/array.hpp" +#include "ginkgo/core/base/types.hpp" namespace { @@ -186,6 +188,85 @@ TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); +TYPED_TEST(Matrix, CountOverlapEntries) +{ + using lit = typename TestFixture::local_index_type; + using git = typename TestFixture::global_index_type; + using vt = typename TestFixture::value_type; + using ca = gko::array; + this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}}; + std::vector overlap_count_ref{ + ca{this->ref, I{0, 5, 3}}, + ca{this->ref, I{4, 0, 3}}, + ca{this->ref, I{4, 5, 0}}}; + comm_index_type num_parts = 3; + auto partition = + gko::experimental::distributed::Partition::build_from_mapping( + this->ref, this->mapping, num_parts); + auto input = this->create_input_full_rank(); + + gko::array overlap_count{ + this->ref, static_cast(num_parts)}; + for (gko::size_type i = 0; i < num_parts; i++) { + overlap_count.fill(0); + gko::kernels::reference::distributed_matrix::count_overlap_entries( + this->ref, input, partition.get(), i, overlap_count); + GKO_ASSERT_ARRAY_EQ(overlap_count, overlap_count_ref[i]); + } +} + + +TYPED_TEST(Matrix, FillOverlapSendBuffers) +{ + using lit = typename TestFixture::local_index_type; + using git = typename TestFixture::global_index_type; + using vt = typename TestFixture::value_type; + using ca = gko::array; + using ga = gko::array; + using va = gko::array; + this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}}; + std::vector overlap_offsets{ + ca{this->ref, I{0, 0, 5, 8}}, + ca{this->ref, I{0, 4, 4, 7}}, + ca{this->ref, I{0, 4, 9, 9}}}; + std::vector overlap_row_idxs_ref{ + ga{this->ref, I{0, 0, 5, 5, 6, 2, 3, 3}}, + ga{this->ref, I{1, 1, 4, 4, 2, 3, 3}}, + ga{this->ref, I{1, 1, 4, 4, 0, 0, 5, 5, 6}}}; + std::vector overlap_col_idxs_ref{ + ga{this->ref, I{0, 3, 4, 5, 5, 2, 0, 3}}, + ga{this->ref, I{1, 2, 4, 6, 2, 0, 3}}, + ga{this->ref, I{1, 2, 4, 6, 0, 3, 4, 5, 5}}}; + std::vector overlap_values_ref{ + va{this->ref, I{1, 2, 10, 11, 12, 5, 6, 7}}, + va{this->ref, I{3, 4, 8, 9, 5, 6, 7}}, + va{this->ref, I{3, 4, 8, 9, 1, 2, 10, 11, 12}}}; + comm_index_type num_parts = 3; + auto partition = + gko::experimental::distributed::Partition::build_from_mapping( + this->ref, this->mapping, num_parts); + auto input = this->create_input_full_rank(); + + gko::array overlap_row_idxs{this->ref}; + gko::array overlap_col_idxs{this->ref}; + gko::array overlap_values{this->ref}; + for (gko::size_type i = 0; i < num_parts; i++) { + overlap_row_idxs.resize_and_reset( + overlap_offsets[i].get_data()[num_parts]); + overlap_col_idxs.resize_and_reset( + overlap_offsets[i].get_data()[num_parts]); + overlap_values.resize_and_reset( + overlap_offsets[i].get_data()[num_parts]); + gko::kernels::reference::distributed_matrix::fill_overlap_send_buffers( + this->ref, input, partition.get(), i, overlap_offsets[i], + overlap_row_idxs, overlap_col_idxs, overlap_values); + GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, overlap_row_idxs_ref[i]); + GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, overlap_col_idxs_ref[i]); + GKO_ASSERT_ARRAY_EQ(overlap_values, overlap_values_ref[i]); + } +} + + TYPED_TEST(Matrix, SeparateLocalNonLocalEmpty) { using lit = typename TestFixture::local_index_type; From 769066a0ba729c5b3a5bf509f7aee9fb7412b39c Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Fri, 19 Jul 2024 12:20:14 +0000 Subject: [PATCH 4/8] Add device kernels and tests --- .../cuda_hip/distributed/matrix_kernels.cpp | 94 ++++++++++++++++++- core/distributed/matrix.cpp | 51 +++++----- core/distributed/matrix_kernels.hpp | 22 +++-- dpcpp/distributed/matrix_kernels.dp.cpp | 8 +- omp/distributed/matrix_kernels.cpp | 70 +++++++++++++- reference/distributed/matrix_kernels.cpp | 54 ++++++++--- reference/test/distributed/matrix_kernels.cpp | 48 ++++++---- test/distributed/matrix_kernels.cpp | 60 +++++++++++- 8 files changed, 324 insertions(+), 83 deletions(-) diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index 70824bf03d9..ab3ec9da8b1 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -20,6 +20,9 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/atomic.hpp" +#include "common/unified/base/kernel_launch.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -55,8 +58,67 @@ void count_overlap_entries( const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, - array& overlap_count) GKO_NOT_IMPLEMENTED; + comm_index_type local_part, array& overlap_count, + array& overlap_positions, + array& original_positions) +{ + auto row_part_ids = row_partition->get_part_ids(); + const auto* row_range_bounds = row_partition->get_range_bounds(); + const auto* row_range_starting_indices = + row_partition->get_range_starting_indices(); + const auto num_row_ranges = row_partition->get_num_ranges(); + const auto num_input_elements = input.get_num_stored_elements(); + + auto policy = thrust_policy(exec); + + // precompute the row and column range id of each input element + auto input_row_idxs = input.get_const_row_idxs(); + array row_range_ids{exec, num_input_elements}; + thrust::upper_bound(policy, row_range_bounds + 1, + row_range_bounds + num_row_ranges + 1, input_row_idxs, + input_row_idxs + num_input_elements, + row_range_ids.get_data()); + + array row_part_ids_per_entry{exec, num_input_elements}; + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto part_id, auto part_ids, auto range_ids, + auto part_ids_per_entry, auto orig_positions) { + part_ids_per_entry[i] = part_ids[range_ids[i]]; + orig_positions[i] = part_ids_per_entry[i] == part_id ? -1 : i; + }, + num_input_elements, local_part, row_part_ids, row_range_ids.get_data(), + row_part_ids_per_entry.get_data(), original_positions.get_data()); + + thrust::stable_sort_by_key( + policy, row_part_ids_per_entry.get_data(), + row_part_ids_per_entry.get_data() + num_input_elements, + original_positions.get_data()); + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto orig_positions, auto overl_positions) { + overl_positions[i] = orig_positions[i] >= 0 ? 1 : 0; + }, + num_input_elements, original_positions.get_const_data(), + overlap_positions.get_data()); + + components::prefix_sum_nonnegative(exec, overlap_positions.get_data(), + num_input_elements); + size_type num_parts = row_partition->get_num_parts(); + array row_part_ptrs{exec, num_parts + 1}; + row_part_ptrs.fill(0); + components::convert_idxs_to_ptrs( + exec, row_part_ids_per_entry.get_const_data(), num_input_elements, + num_parts, row_part_ptrs.get_data()); + + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) { + count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i]; + }, + num_parts, local_part, row_part_ptrs.get_data(), + overlap_count.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_OVERLAP_ENTRIES); @@ -68,10 +130,32 @@ void fill_overlap_send_buffers( const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, array& offsets, + comm_index_type local_part, const array& overlap_positions, + const array& original_positions, array& overlap_row_idxs, - array& overlap_col_idxs, - array& overlap_values) GKO_NOT_IMPLEMENTED; + array& overlap_col_idxs, array& overlap_values) +{ + auto num_entries = input.get_num_stored_elements(); + auto input_row_idxs = input.get_const_row_idxs(); + auto input_col_idxs = input.get_const_col_idxs(); + auto input_values = input.get_const_values(); + + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto in_rows, auto in_cols, auto in_vals, + auto in_pos, auto out_pos, auto out_rows, auto out_cols, + auto out_vals) { + if (in_pos[i] >= 0) { + out_rows[out_pos[i]] = in_rows[in_pos[i]]; + out_cols[out_pos[i]] = in_cols[in_pos[i]]; + out_vals[out_pos[i]] = in_vals[in_pos[i]]; + } + }, + num_entries, input_row_idxs, input_col_idxs, input_values, + original_positions.get_const_data(), overlap_positions.get_const_data(), + overlap_row_idxs.get_data(), overlap_col_idxs.get_data(), + overlap_values.get_data()); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index a64a07619b3..3799895abf3 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -4,17 +4,14 @@ #include "ginkgo/core/distributed/matrix.hpp" -#include -#include - #include #include #include #include #include +#include "core/components/prefix_sum_kernels.hpp" #include "core/distributed/matrix_kernels.hpp" -#include "ginkgo/core/base/mtx_io.hpp" namespace gko { @@ -271,18 +268,23 @@ void Matrix::read_distributed( device_matrix_data all_data{exec}; if (assembly_type == assembly::communicate) { - array overlap_count{exec, comm.size()}; + size_type num_entries = data.get_num_stored_elements(); + size_type num_parts = comm.size(); + array overlap_count{exec, num_parts}; + array overlap_positions{exec, num_entries}; + array original_positions{exec, num_entries}; overlap_count.fill(0); auto tmp_part = make_temporary_clone(exec, row_partition); exec->run(matrix::make_count_overlap_entries( - data, tmp_part.get(), local_part, overlap_count)); + data, tmp_part.get(), local_part, overlap_count, overlap_positions, + original_positions)); overlap_count.set_executor(exec->get_master()); std::vector overlap_send_sizes( - overlap_count.get_data(), overlap_count.get_data() + comm.size()); - std::vector overlap_send_offsets(comm.size() + 1); - std::vector overlap_recv_sizes(comm.size()); - std::vector overlap_recv_offsets(comm.size() + 1); + overlap_count.get_data(), overlap_count.get_data() + num_parts); + std::vector overlap_send_offsets(num_parts + 1); + std::vector overlap_recv_sizes(num_parts); + std::vector overlap_recv_offsets(num_parts + 1); std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(), overlap_send_offsets.begin() + 1); @@ -301,14 +303,10 @@ void Matrix::read_distributed( array overlap_recv_row_idxs{exec, n_recv}; array overlap_recv_col_idxs{exec, n_recv}; array overlap_recv_values{exec, n_recv}; - auto offset_array = - make_const_array_view(exec->get_master(), comm.size() + 1, - overlap_send_offsets.data()) - .copy_to_array(); - offset_array.set_executor(exec); exec->run(matrix::make_fill_overlap_send_buffers( - data, tmp_part.get(), local_part, offset_array, - overlap_send_row_idxs, overlap_send_col_idxs, overlap_send_values)); + data, tmp_part.get(), local_part, overlap_positions, + original_positions, overlap_send_row_idxs, overlap_send_col_idxs, + overlap_send_values)); if (use_host_buffer) { overlap_send_row_idxs.set_executor(exec->get_master()); @@ -339,22 +337,21 @@ void Matrix::read_distributed( overlap_recv_values.set_executor(exec); } - size_type n_nnz = data.get_num_stored_elements(); - array all_row_idxs{exec, n_nnz + n_recv}; - array all_col_idxs{exec, n_nnz + n_recv}; - array all_values{exec, n_nnz + n_recv}; - exec->copy_from(exec, n_nnz, data.get_const_row_idxs(), + array all_row_idxs{exec, num_entries + n_recv}; + array all_col_idxs{exec, num_entries + n_recv}; + array all_values{exec, num_entries + n_recv}; + exec->copy_from(exec, num_entries, data.get_const_row_idxs(), all_row_idxs.get_data()); exec->copy_from(exec, n_recv, overlap_recv_row_idxs.get_data(), - all_row_idxs.get_data() + n_nnz); - exec->copy_from(exec, n_nnz, data.get_const_col_idxs(), + all_row_idxs.get_data() + num_entries); + exec->copy_from(exec, num_entries, data.get_const_col_idxs(), all_col_idxs.get_data()); exec->copy_from(exec, n_recv, overlap_recv_col_idxs.get_data(), - all_col_idxs.get_data() + n_nnz); - exec->copy_from(exec, n_nnz, data.get_const_values(), + all_col_idxs.get_data() + num_entries); + exec->copy_from(exec, num_entries, data.get_const_values(), all_values.get_data()); exec->copy_from(exec, n_recv, overlap_recv_values.get_data(), - all_values.get_data() + n_nnz); + all_values.get_data() + num_entries); all_data = device_matrix_data{ exec, global_dim, all_row_idxs, all_col_idxs, all_values}; all_data.sum_duplicates(); diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index fa7e891eb4e..4cdaf3e17fe 100644 --- a/core/distributed/matrix_kernels.hpp +++ b/core/distributed/matrix_kernels.hpp @@ -19,14 +19,16 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \ - GlobalIndexType) \ - void count_overlap_entries( \ - std::shared_ptr exec, \ - const device_matrix_data& input, \ - const experimental::distributed::Partition< \ - LocalIndexType, GlobalIndexType>* row_partition, \ - comm_index_type local_part, array& overlap_count) +#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void count_overlap_entries( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* row_partition, \ + comm_index_type local_part, array& overlap_count, \ + array& overlap_positions, \ + array& original_positions) #define GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \ @@ -36,7 +38,9 @@ namespace kernels { const device_matrix_data& input, \ const experimental::distributed::Partition< \ LocalIndexType, GlobalIndexType>* row_partition, \ - comm_index_type local_part, array& offsets, \ + comm_index_type local_part, \ + const array& overlap_positions, \ + const array& original_positions, \ array& overlap_row_idxs, \ array& overlap_col_idxs, \ array& overlap_values) diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp index 9225e58ad14..60fc0686473 100644 --- a/dpcpp/distributed/matrix_kernels.dp.cpp +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -19,8 +19,9 @@ void count_overlap_entries( const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, - array& overlap_count) GKO_NOT_IMPLEMENTED; + comm_index_type local_part, array& overlap_count, + array& overlap_positions, + array& original_positions) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_OVERLAP_ENTRIES); @@ -32,7 +33,8 @@ void fill_overlap_send_buffers( const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, array& offsets, + comm_index_type local_part, const array& overlap_positions, + const array& original_positions, array& overlap_row_idxs, array& overlap_col_idxs, array& overlap_values) GKO_NOT_IMPLEMENTED; diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp index a3e8cb60868..55ee5524116 100644 --- a/omp/distributed/matrix_kernels.cpp +++ b/omp/distributed/matrix_kernels.cpp @@ -4,6 +4,8 @@ #include "core/distributed/matrix_kernels.hpp" +#include + #include #include @@ -26,8 +28,50 @@ void count_overlap_entries( const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, - array& overlap_count) GKO_NOT_IMPLEMENTED; + comm_index_type local_part, array& overlap_count, + array& overlap_positions, + array& original_positions) +{ + auto num_input_elements = input.get_num_stored_elements(); + auto input_row_idxs = input.get_const_row_idxs(); + auto row_part_ids = row_partition->get_part_ids(); + array row_part_ids_per_entry{exec, num_input_elements}; + + size_type row_range_id = 0; +#pragma omp parallel for firstprivate(row_range_id) + for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { + auto global_row = input_row_idxs[i]; + row_range_id = find_range(global_row, row_partition, row_range_id); + auto row_part_id = row_part_ids[row_range_id]; + row_part_ids_per_entry.get_data()[i] = row_part_id; + if (row_part_id != local_part) { +#pragma omp atomic + overlap_count.get_data()[row_part_id]++; + original_positions.get_data()[i] = i; + } else { + original_positions.get_data()[i] = -1; + } + } + + auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) { + comm_index_type a = + i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i]; + comm_index_type b = + j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j]; + return a < b; + }; + std::stable_sort(original_positions.get_data(), + original_positions.get_data() + num_input_elements, comp); + +#pragma omp parallel for + for (size_type i = 0; i < num_input_elements; i++) { + overlap_positions.get_data()[i] = + original_positions.get_const_data()[i] == -1 ? 0 : 1; + } + + components::prefix_sum_nonnegative(exec, overlap_positions.get_data(), + num_input_elements); +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_COUNT_OVERLAP_ENTRIES); @@ -39,10 +83,26 @@ void fill_overlap_send_buffers( const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, array& offsets, + comm_index_type local_part, const array& overlap_positions, + const array& original_positions, array& overlap_row_idxs, - array& overlap_col_idxs, - array& overlap_values) GKO_NOT_IMPLEMENTED; + array& overlap_col_idxs, array& overlap_values) +{ + auto input_row_idxs = input.get_const_row_idxs(); + auto input_col_idxs = input.get_const_col_idxs(); + auto input_vals = input.get_const_values(); + +#pragma omp parallel for + for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { + auto in_pos = original_positions.get_const_data()[i]; + if (in_pos >= 0) { + auto out_pos = overlap_positions.get_const_data()[i]; + overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos]; + overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos]; + overlap_values.get_data()[out_pos] = input_vals[in_pos]; + } + } +} GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp index d8b0f9e1d4f..6a57a64e075 100644 --- a/reference/distributed/matrix_kernels.cpp +++ b/reference/distributed/matrix_kernels.cpp @@ -4,10 +4,12 @@ #include "core/distributed/matrix_kernels.hpp" +#include +#include + #include "core/base/allocator.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/base/iterator_factory.hpp" -#include "ginkgo/core/distributed/partition.hpp" #include "reference/distributed/partition_helpers.hpp" @@ -23,21 +25,47 @@ void count_overlap_entries( const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, array& overlap_count) + comm_index_type local_part, array& overlap_count, + array& overlap_positions, + array& original_positions) { + auto num_input_elements = input.get_num_stored_elements(); auto input_row_idxs = input.get_const_row_idxs(); auto row_part_ids = row_partition->get_part_ids(); + array row_part_ids_per_entry{exec, num_input_elements}; size_type row_range_id = 0; for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { auto global_row = input_row_idxs[i]; row_range_id = find_range(global_row, row_partition, row_range_id); - row_range_id = find_range(global_row, row_partition, row_range_id); auto row_part_id = row_part_ids[row_range_id]; + row_part_ids_per_entry.get_data()[i] = row_part_id; if (row_part_id != local_part) { overlap_count.get_data()[row_part_id]++; + original_positions.get_data()[i] = i; + } else { + original_positions.get_data()[i] = -1; } } + + auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) { + comm_index_type a = + i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i]; + comm_index_type b = + j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j]; + return a < b; + }; + + std::stable_sort(original_positions.get_data(), + original_positions.get_data() + num_input_elements, comp); + for (size_type i = 0; i < num_input_elements; i++) { + overlap_positions.get_data()[i] = + original_positions.get_const_data()[i] == -1 ? 0 : 1; + } + + std::exclusive_scan(overlap_positions.get_data(), + overlap_positions.get_data() + num_input_elements, + overlap_positions.get_data(), 0); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( @@ -50,26 +78,22 @@ void fill_overlap_send_buffers( const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, array& offsets, + comm_index_type local_part, const array& overlap_positions, + const array& original_positions, array& overlap_row_idxs, array& overlap_col_idxs, array& overlap_values) { auto input_row_idxs = input.get_const_row_idxs(); auto input_col_idxs = input.get_const_col_idxs(); auto input_vals = input.get_const_values(); - auto row_part_ids = row_partition->get_part_ids(); - size_type row_range_id = 0; for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { - auto global_row = input_row_idxs[i]; - row_range_id = find_range(global_row, row_partition, row_range_id); - row_range_id = find_range(global_row, row_partition, row_range_id); - auto row_part_id = row_part_ids[row_range_id]; - if (row_part_id != local_part) { - auto idx = offsets.get_data()[row_part_id]++; - overlap_row_idxs.get_data()[idx] = global_row; - overlap_col_idxs.get_data()[idx] = input_col_idxs[i]; - overlap_values.get_data()[idx] = input_vals[i]; + auto in_pos = original_positions.get_const_data()[i]; + if (in_pos >= 0) { + auto out_pos = overlap_positions.get_const_data()[i]; + overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos]; + overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos]; + overlap_values.get_data()[out_pos] = input_vals[in_pos]; } } } diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp index 00d3fcd8895..80fc8eb3330 100644 --- a/reference/test/distributed/matrix_kernels.cpp +++ b/reference/test/distributed/matrix_kernels.cpp @@ -17,8 +17,6 @@ #include #include "core/test/utils.hpp" -#include "ginkgo/core/base/array.hpp" -#include "ginkgo/core/base/types.hpp" namespace { @@ -194,24 +192,37 @@ TYPED_TEST(Matrix, CountOverlapEntries) using git = typename TestFixture::global_index_type; using vt = typename TestFixture::value_type; using ca = gko::array; + using ga = gko::array; this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}}; std::vector overlap_count_ref{ ca{this->ref, I{0, 5, 3}}, ca{this->ref, I{4, 0, 3}}, ca{this->ref, I{4, 5, 0}}}; + std::vector overlap_pos_ref{ + ga{this->ref, I{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}}, + ga{this->ref, I{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}}, + ga{this->ref, I{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}}; + std::vector original_pos_ref{ + ga{this->ref, I{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}}, + ga{this->ref, I{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}}, + ga{this->ref, I{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}}; comm_index_type num_parts = 3; auto partition = gko::experimental::distributed::Partition::build_from_mapping( this->ref, this->mapping, num_parts); auto input = this->create_input_full_rank(); - gko::array overlap_count{ - this->ref, static_cast(num_parts)}; + ca overlap_count{this->ref, static_cast(num_parts)}; + ga overlap_positions{this->ref, input.get_num_stored_elements()}; + ga original_positions{this->ref, input.get_num_stored_elements()}; for (gko::size_type i = 0; i < num_parts; i++) { overlap_count.fill(0); gko::kernels::reference::distributed_matrix::count_overlap_entries( - this->ref, input, partition.get(), i, overlap_count); + this->ref, input, partition.get(), i, overlap_count, + overlap_positions, original_positions); GKO_ASSERT_ARRAY_EQ(overlap_count, overlap_count_ref[i]); + GKO_ASSERT_ARRAY_EQ(overlap_positions, overlap_pos_ref[i]); + GKO_ASSERT_ARRAY_EQ(original_positions, original_pos_ref[i]); } } @@ -225,10 +236,14 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers) using ga = gko::array; using va = gko::array; this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}}; - std::vector overlap_offsets{ - ca{this->ref, I{0, 0, 5, 8}}, - ca{this->ref, I{0, 4, 4, 7}}, - ca{this->ref, I{0, 4, 9, 9}}}; + std::vector overlap_positions{ + ga{this->ref, I{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}}, + ga{this->ref, I{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}}, + ga{this->ref, I{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}}; + std::vector original_positions{ + ga{this->ref, I{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}}, + ga{this->ref, I{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}}, + ga{this->ref, I{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}}; std::vector overlap_row_idxs_ref{ ga{this->ref, I{0, 0, 5, 5, 6, 2, 3, 3}}, ga{this->ref, I{1, 1, 4, 4, 2, 3, 3}}, @@ -251,15 +266,14 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers) gko::array overlap_col_idxs{this->ref}; gko::array overlap_values{this->ref}; for (gko::size_type i = 0; i < num_parts; i++) { - overlap_row_idxs.resize_and_reset( - overlap_offsets[i].get_data()[num_parts]); - overlap_col_idxs.resize_and_reset( - overlap_offsets[i].get_data()[num_parts]); - overlap_values.resize_and_reset( - overlap_offsets[i].get_data()[num_parts]); + auto num_entries = overlap_row_idxs_ref[i].get_size(); + overlap_row_idxs.resize_and_reset(num_entries); + overlap_col_idxs.resize_and_reset(num_entries); + overlap_values.resize_and_reset(num_entries); gko::kernels::reference::distributed_matrix::fill_overlap_send_buffers( - this->ref, input, partition.get(), i, overlap_offsets[i], - overlap_row_idxs, overlap_col_idxs, overlap_values); + this->ref, input, partition.get(), i, overlap_positions[i], + original_positions[i], overlap_row_idxs, overlap_col_idxs, + overlap_values); GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, overlap_row_idxs_ref[i]); GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, overlap_col_idxs_ref[i]); GKO_ASSERT_ARRAY_EQ(overlap_values, overlap_values_ref[i]); diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index ad91d699496..6de772e8006 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -48,8 +48,9 @@ class Matrix : public CommonTestFixture { { gko::device_matrix_data d_input{exec, input}; - for (comm_index_type part = 0; part < row_partition->get_num_parts(); - ++part) { + gko::size_type num_parts = row_partition->get_num_parts(); + gko::size_type num_entries = input.get_num_stored_elements(); + for (comm_index_type part = 0; part < num_parts; ++part) { gko::array local_row_idxs{ref}; gko::array local_col_idxs{ref}; gko::array local_values{ref}; @@ -62,6 +63,55 @@ class Matrix : public CommonTestFixture { gko::array d_non_local_row_idxs{exec}; gko::array d_non_local_col_idxs{exec}; gko::array d_non_local_values{exec}; + gko::array overlap_count{ref, num_parts}; + overlap_count.fill(0); + gko::array d_overlap_count{exec, num_parts}; + d_overlap_count.fill(0); + gko::array overlap_positions{ref, num_entries}; + gko::array d_overlap_positions{exec, + num_entries}; + gko::array original_positions{ref, num_entries}; + gko::array d_original_positions{exec, + num_entries}; + + gko::kernels::reference::distributed_matrix::count_overlap_entries( + ref, input, row_partition.get(), part, overlap_count, + overlap_positions, original_positions); + gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix:: + count_overlap_entries( + exec, d_input, d_row_partition.get(), part, d_overlap_count, + d_overlap_positions, d_original_positions); + + gko::array overlap_offsets{ref, num_parts + 1}; + std::partial_sum(overlap_count.get_data(), + overlap_count.get_data() + num_parts, + overlap_offsets.get_data() + 1); + overlap_offsets.get_data()[0] = 0; + gko::array d_overlap_offsets{exec, + overlap_offsets}; + gko::size_type num_overlap_entries = + overlap_offsets.get_data()[num_parts]; + gko::array overlap_row_idxs{ref, + num_overlap_entries}; + gko::array overlap_col_idxs{ref, + num_overlap_entries}; + gko::array overlap_values{ref, num_overlap_entries}; + gko::array d_overlap_row_idxs{ + exec, num_overlap_entries}; + gko::array d_overlap_col_idxs{ + exec, num_overlap_entries}; + gko::array d_overlap_values{exec, num_overlap_entries}; + + gko::kernels::reference::distributed_matrix:: + fill_overlap_send_buffers(ref, input, row_partition.get(), part, + overlap_positions, original_positions, + overlap_row_idxs, overlap_col_idxs, + overlap_values); + gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix:: + fill_overlap_send_buffers( + exec, d_input, d_row_partition.get(), part, + d_overlap_positions, d_original_positions, + d_overlap_row_idxs, d_overlap_col_idxs, d_overlap_values); gko::kernels::reference::distributed_matrix:: separate_local_nonlocal( @@ -75,6 +125,12 @@ class Matrix : public CommonTestFixture { d_non_local_row_idxs, d_non_local_col_idxs, d_non_local_values); + GKO_ASSERT_ARRAY_EQ(overlap_positions, d_overlap_positions); + GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions); + GKO_ASSERT_ARRAY_EQ(overlap_count, d_overlap_count); + GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, d_overlap_row_idxs); + GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, d_overlap_col_idxs); + GKO_ASSERT_ARRAY_EQ(overlap_values, d_overlap_values); GKO_ASSERT_ARRAY_EQ(local_row_idxs, d_local_row_idxs); GKO_ASSERT_ARRAY_EQ(local_col_idxs, d_local_col_idxs); GKO_ASSERT_ARRAY_EQ(local_values, d_local_values); From 69d8d860f32adbb2352f7dd46084e0f0bbef94af Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Fri, 18 Oct 2024 17:45:52 +0200 Subject: [PATCH 5/8] Address Review comments --- .../cuda_hip/distributed/matrix_kernels.cpp | 35 ++-- core/device_hooks/common_kernels.inc.cpp | 4 +- core/distributed/matrix.cpp | 149 +++++++++--------- core/distributed/matrix_kernels.hpp | 68 ++++---- dpcpp/distributed/matrix_kernels.dp.cpp | 20 +-- include/ginkgo/core/distributed/matrix.hpp | 12 +- omp/distributed/matrix_kernels.cpp | 32 ++-- reference/distributed/matrix_kernels.cpp | 36 ++--- reference/test/distributed/matrix_kernels.cpp | 66 ++++---- test/distributed/matrix_kernels.cpp | 91 +++++------ test/mpi/matrix.cpp | 4 +- 11 files changed, 253 insertions(+), 264 deletions(-) diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index ab3ec9da8b1..159ddbce296 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -53,13 +53,13 @@ struct input_type { template -void count_overlap_entries( +void count_non_owning_entries( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, array& overlap_count, - array& overlap_positions, + comm_index_type local_part, array& send_count, + array& send_positions, array& original_positions) { auto row_part_ids = row_partition->get_part_ids(); @@ -96,13 +96,13 @@ void count_overlap_entries( original_positions.get_data()); run_kernel( exec, - [] GKO_KERNEL(auto i, auto orig_positions, auto overl_positions) { - overl_positions[i] = orig_positions[i] >= 0 ? 1 : 0; + [] GKO_KERNEL(auto i, auto orig_positions, auto s_positions) { + s_positions[i] = orig_positions[i] >= 0 ? 1 : 0; }, num_input_elements, original_positions.get_const_data(), - overlap_positions.get_data()); + send_positions.get_data()); - components::prefix_sum_nonnegative(exec, overlap_positions.get_data(), + components::prefix_sum_nonnegative(exec, send_positions.get_data(), num_input_elements); size_type num_parts = row_partition->get_num_parts(); array row_part_ptrs{exec, num_parts + 1}; @@ -116,24 +116,23 @@ void count_overlap_entries( [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) { count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i]; }, - num_parts, local_part, row_part_ptrs.get_data(), - overlap_count.get_data()); + num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_COUNT_OVERLAP_ENTRIES); + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); template -void fill_overlap_send_buffers( +void fill_send_buffers( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, const array& overlap_positions, + comm_index_type local_part, const array& send_positions, const array& original_positions, - array& overlap_row_idxs, - array& overlap_col_idxs, array& overlap_values) + array& send_row_idxs, + array& send_col_idxs, array& send_values) { auto num_entries = input.get_num_stored_elements(); auto input_row_idxs = input.get_const_row_idxs(); @@ -152,13 +151,13 @@ void fill_overlap_send_buffers( } }, num_entries, input_row_idxs, input_col_idxs, input_values, - original_positions.get_const_data(), overlap_positions.get_const_data(), - overlap_row_idxs.get_data(), overlap_col_idxs.get_data(), - overlap_values.get_data()); + original_positions.get_const_data(), send_positions.get_const_data(), + send_row_idxs.get_data(), send_col_idxs.get_data(), + send_values.get_data()); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); + GKO_DECLARE_FILL_SEND_BUFFERS); template diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 304bed16683..746187330f1 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -274,9 +274,9 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( namespace distributed_matrix { -GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_COUNT_OVERLAP_ENTRIES); GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_FILL_SEND_BUFFERS); GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 3799895abf3..2c174f26806 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -21,10 +21,10 @@ namespace matrix { namespace { -GKO_REGISTER_OPERATION(count_overlap_entries, - distributed_matrix::count_overlap_entries); -GKO_REGISTER_OPERATION(fill_overlap_send_buffers, - distributed_matrix::fill_overlap_send_buffers); +GKO_REGISTER_OPERATION(count_non_owning_entries, + distributed_matrix::count_non_owning_entries); +GKO_REGISTER_OPERATION(fill_send_buffers, + distributed_matrix::fill_send_buffers); GKO_REGISTER_OPERATION(separate_local_nonlocal, distributed_matrix::separate_local_nonlocal); @@ -249,7 +249,7 @@ void Matrix::read_distributed( row_partition, std::shared_ptr> col_partition, - assembly assembly_type) + assembly_mode assembly_type) { const auto comm = this->get_communicator(); GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size()); @@ -259,6 +259,8 @@ void Matrix::read_distributed( auto exec = this->get_executor(); auto local_part = comm.rank(); auto use_host_buffer = mpi::requires_host_buffer(exec, comm); + auto tmp_row_partition = make_temporary_clone(exec, row_partition); + auto tmp_col_partition = make_temporary_clone(exec, col_partition); // set up LinOp sizes auto global_num_rows = row_partition->get_size(); @@ -267,74 +269,69 @@ void Matrix::read_distributed( this->set_size(global_dim); device_matrix_data all_data{exec}; - if (assembly_type == assembly::communicate) { + if (assembly_type == assembly_mode::communicate) { size_type num_entries = data.get_num_stored_elements(); size_type num_parts = comm.size(); - array overlap_count{exec, num_parts}; - array overlap_positions{exec, num_entries}; + array send_sizes{exec, num_parts}; + array send_positions{exec, num_entries}; array original_positions{exec, num_entries}; - overlap_count.fill(0); - auto tmp_part = make_temporary_clone(exec, row_partition); - exec->run(matrix::make_count_overlap_entries( - data, tmp_part.get(), local_part, overlap_count, overlap_positions, - original_positions)); - - overlap_count.set_executor(exec->get_master()); - std::vector overlap_send_sizes( - overlap_count.get_data(), overlap_count.get_data() + num_parts); - std::vector overlap_send_offsets(num_parts + 1); - std::vector overlap_recv_sizes(num_parts); - std::vector overlap_recv_offsets(num_parts + 1); - - std::partial_sum(overlap_send_sizes.begin(), overlap_send_sizes.end(), - overlap_send_offsets.begin() + 1); - comm.all_to_all(exec, overlap_send_sizes.data(), 1, - overlap_recv_sizes.data(), 1); - std::partial_sum(overlap_recv_sizes.begin(), overlap_recv_sizes.end(), - overlap_recv_offsets.begin() + 1); - overlap_send_offsets[0] = 0; - overlap_recv_offsets[0] = 0; - - size_type n_send = overlap_send_offsets.back(); - size_type n_recv = overlap_recv_offsets.back(); - array overlap_send_row_idxs{exec, n_send}; - array overlap_send_col_idxs{exec, n_send}; - array overlap_send_values{exec, n_send}; - array overlap_recv_row_idxs{exec, n_recv}; - array overlap_recv_col_idxs{exec, n_recv}; - array overlap_recv_values{exec, n_recv}; - exec->run(matrix::make_fill_overlap_send_buffers( - data, tmp_part.get(), local_part, overlap_positions, - original_positions, overlap_send_row_idxs, overlap_send_col_idxs, - overlap_send_values)); + send_sizes.fill(0); + exec->run(matrix::make_count_non_owning_entries( + data, tmp_row_partition.get(), local_part, send_sizes, + send_positions, original_positions)); + + send_sizes.set_executor(exec->get_master()); + array send_offsets{exec->get_master(), num_parts + 1}; + array recv_sizes{exec->get_master(), num_parts}; + array recv_offsets{exec->get_master(), num_parts + 1}; + + std::partial_sum(send_sizes.get_data(), + send_sizes.get_data() + num_parts, + send_offsets.get_data() + 1); + comm.all_to_all(exec, send_sizes.get_data(), 1, recv_sizes.get_data(), + 1); + std::partial_sum(recv_sizes.get_data(), + recv_sizes.get_data() + num_parts, + recv_offsets.get_data() + 1); + send_offsets.get_data()[0] = 0; + recv_offsets.get_data()[0] = 0; + + size_type n_send = send_offsets.get_data()[num_parts]; + size_type n_recv = recv_offsets.get_data()[num_parts]; + array send_row_idxs{exec, n_send}; + array send_col_idxs{exec, n_send}; + array send_values{exec, n_send}; + array recv_row_idxs{exec, n_recv}; + array recv_col_idxs{exec, n_recv}; + array recv_values{exec, n_recv}; + exec->run(matrix::make_fill_send_buffers( + data, tmp_row_partition.get(), local_part, send_positions, + original_positions, send_row_idxs, send_col_idxs, send_values)); if (use_host_buffer) { - overlap_send_row_idxs.set_executor(exec->get_master()); - overlap_send_col_idxs.set_executor(exec->get_master()); - overlap_send_values.set_executor(exec->get_master()); - overlap_recv_row_idxs.set_executor(exec->get_master()); - overlap_recv_col_idxs.set_executor(exec->get_master()); - overlap_recv_values.set_executor(exec->get_master()); + send_row_idxs.set_executor(exec->get_master()); + send_col_idxs.set_executor(exec->get_master()); + send_values.set_executor(exec->get_master()); + recv_row_idxs.set_executor(exec->get_master()); + recv_col_idxs.set_executor(exec->get_master()); + recv_values.set_executor(exec->get_master()); } - comm.all_to_all_v( - use_host_buffer ? exec : exec->get_master(), - overlap_send_row_idxs.get_const_data(), overlap_send_sizes.data(), - overlap_send_offsets.data(), overlap_recv_row_idxs.get_data(), - overlap_recv_sizes.data(), overlap_recv_offsets.data()); - comm.all_to_all_v( - use_host_buffer ? exec : exec->get_master(), - overlap_send_col_idxs.get_const_data(), overlap_send_sizes.data(), - overlap_send_offsets.data(), overlap_recv_col_idxs.get_data(), - overlap_recv_sizes.data(), overlap_recv_offsets.data()); - comm.all_to_all_v( - use_host_buffer ? exec : exec->get_master(), - overlap_send_values.get_const_data(), overlap_send_sizes.data(), - overlap_send_offsets.data(), overlap_recv_values.get_data(), - overlap_recv_sizes.data(), overlap_recv_offsets.data()); + comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(), + send_row_idxs.get_const_data(), send_sizes.get_data(), + send_offsets.get_data(), recv_row_idxs.get_data(), + recv_sizes.get_data(), recv_offsets.get_data()); + comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(), + send_col_idxs.get_const_data(), send_sizes.get_data(), + send_offsets.get_data(), recv_col_idxs.get_data(), + recv_sizes.get_data(), recv_offsets.get_data()); + comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(), + send_values.get_const_data(), send_sizes.get_data(), + send_offsets.get_data(), recv_values.get_data(), + recv_sizes.get_data(), recv_offsets.get_data()); if (use_host_buffer) { - overlap_recv_row_idxs.set_executor(exec); - overlap_recv_col_idxs.set_executor(exec); - overlap_recv_values.set_executor(exec); + recv_row_idxs.set_executor(exec); + recv_col_idxs.set_executor(exec); + recv_values.set_executor(exec); } array all_row_idxs{exec, num_entries + n_recv}; @@ -342,18 +339,19 @@ void Matrix::read_distributed( array all_values{exec, num_entries + n_recv}; exec->copy_from(exec, num_entries, data.get_const_row_idxs(), all_row_idxs.get_data()); - exec->copy_from(exec, n_recv, overlap_recv_row_idxs.get_data(), + exec->copy_from(exec, n_recv, recv_row_idxs.get_data(), all_row_idxs.get_data() + num_entries); exec->copy_from(exec, num_entries, data.get_const_col_idxs(), all_col_idxs.get_data()); - exec->copy_from(exec, n_recv, overlap_recv_col_idxs.get_data(), + exec->copy_from(exec, n_recv, recv_col_idxs.get_data(), all_col_idxs.get_data() + num_entries); exec->copy_from(exec, num_entries, data.get_const_values(), all_values.get_data()); - exec->copy_from(exec, n_recv, overlap_recv_values.get_data(), + exec->copy_from(exec, n_recv, recv_values.get_data(), all_values.get_data() + num_entries); all_data = device_matrix_data{ - exec, global_dim, all_row_idxs, all_col_idxs, all_values}; + exec, global_dim, std::move(all_row_idxs), std::move(all_col_idxs), + std::move(all_values)}; all_data.sum_duplicates(); } @@ -370,9 +368,8 @@ void Matrix::read_distributed( // as well as the rows of the non-local block. The columns of the non-local // block are still in global indices. exec->run(matrix::make_separate_local_nonlocal( - assembly_type == assembly::communicate ? all_data : data, - make_temporary_clone(exec, row_partition).get(), - make_temporary_clone(exec, col_partition).get(), local_part, + assembly_type == assembly_mode::communicate ? all_data : data, + tmp_row_partition.get(), tmp_col_partition.get(), local_part, local_row_idxs, local_col_idxs, local_values, non_local_row_idxs, global_non_local_col_idxs, non_local_values)); @@ -456,7 +453,7 @@ void Matrix::read_distributed( row_partition, std::shared_ptr> col_partition, - assembly assembly_type) + assembly_mode assembly_type) { return this->read_distributed( device_matrix_data::create_from_host( @@ -470,7 +467,7 @@ void Matrix::read_distributed( const matrix_data& data, std::shared_ptr> partition, - assembly assembly_type) + assembly_mode assembly_type) { return this->read_distributed( device_matrix_data::create_from_host( @@ -484,7 +481,7 @@ void Matrix::read_distributed( const device_matrix_data& data, std::shared_ptr> partition, - assembly assembly_type) + assembly_mode assembly_type) { return this->read_distributed(data, partition, partition, assembly_type); } diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index 4cdaf3e17fe..3ba02c27718 100644 --- a/core/distributed/matrix_kernels.hpp +++ b/core/distributed/matrix_kernels.hpp @@ -19,31 +19,30 @@ namespace gko { namespace kernels { -#define GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \ - GlobalIndexType) \ - void count_overlap_entries( \ - std::shared_ptr exec, \ - const device_matrix_data& input, \ - const experimental::distributed::Partition< \ - LocalIndexType, GlobalIndexType>* row_partition, \ - comm_index_type local_part, array& overlap_count, \ - array& overlap_positions, \ +#define GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void count_non_owning_entries( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* row_partition, \ + comm_index_type local_part, array& send_count, \ + array& send_positions, \ array& original_positions) -#define GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \ - GlobalIndexType) \ - void fill_overlap_send_buffers( \ - std::shared_ptr exec, \ - const device_matrix_data& input, \ - const experimental::distributed::Partition< \ - LocalIndexType, GlobalIndexType>* row_partition, \ - comm_index_type local_part, \ - const array& overlap_positions, \ - const array& original_positions, \ - array& overlap_row_idxs, \ - array& overlap_col_idxs, \ - array& overlap_values) +#define GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void fill_send_buffers( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* row_partition, \ + comm_index_type local_part, \ + const array& send_positions, \ + const array& original_positions, \ + array& send_row_idxs, \ + array& send_col_idxs, array& send_values) #define GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ @@ -62,19 +61,18 @@ namespace kernels { array& non_local_values) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - using comm_index_type = experimental::distributed::comm_index_type; \ - template \ - GKO_DECLARE_COUNT_OVERLAP_ENTRIES(ValueType, LocalIndexType, \ - GlobalIndexType); \ - template \ - GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS(ValueType, LocalIndexType, \ - GlobalIndexType); \ - template \ - GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + using comm_index_type = experimental::distributed::comm_index_type; \ + template \ + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType, \ + GlobalIndexType); \ + template \ + GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType, GlobalIndexType); \ + template \ + GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ GlobalIndexType) diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp index 60fc0686473..e7aad7689a8 100644 --- a/dpcpp/distributed/matrix_kernels.dp.cpp +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -14,33 +14,33 @@ namespace distributed_matrix { template -void count_overlap_entries( +void non_owningverlap_entries( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, array& overlap_count, - array& overlap_positions, + comm_index_type local_part, array& send_count, + array& send_positions, array& original_positions) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_COUNT_OVERLAP_ENTRIES); + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); template -void fill_overlap_send_buffers( +void fill_send_buffers( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, const array& overlap_positions, + comm_index_type local_part, const array& send_positions, const array& original_positions, - array& overlap_row_idxs, - array& overlap_col_idxs, - array& overlap_values) GKO_NOT_IMPLEMENTED; + array& send_row_idxs, + array& send_col_idxs, + array& send_values) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); + GKO_DECLARE_FILL_SEND_BUFFERS); template diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 51e6e498348..3f1885bc8bf 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -134,7 +134,7 @@ namespace distributed { /** - * assembly defines how the read_distributed function of the distributed + * assembly_mode defines how the read_distributed function of the distributed * matrix treats non-local indices in the (device_)matrix_data: * - communicate communicates the overlap between ranks and adds up all local * contributions. Indices smaller than 0 or larger than the global size @@ -142,7 +142,7 @@ namespace distributed { * - local_only does not communicate any overlap but ignores all non-local * indices. */ -enum class assembly { communicate, local_only }; +enum class assembly_mode { communicate, local_only }; template @@ -310,7 +310,7 @@ class Matrix const device_matrix_data& data, std::shared_ptr> partition, - assembly assembly_type = assembly::local_only); + assembly_mode assembly_type = assembly_mode::local_only); /** * Reads a square matrix from the matrix_data structure and a global @@ -325,7 +325,7 @@ class Matrix const matrix_data& data, std::shared_ptr> partition, - assembly assembly_type = assembly::local_only); + assembly_mode assembly_type = assembly_mode::local_only); /** * Reads a matrix from the device_matrix_data structure, a global row @@ -350,7 +350,7 @@ class Matrix row_partition, std::shared_ptr> col_partition, - assembly assembly_type = assembly::local_only); + assembly_mode assembly_type = assembly_mode::local_only); /** * Reads a matrix from the matrix_data structure, a global row partition, @@ -367,7 +367,7 @@ class Matrix row_partition, std::shared_ptr> col_partition, - assembly assembly_type = assembly::local_only); + assembly_mode assembly_type = assembly_mode::local_only); /** * Get read access to the stored local matrix. diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp index 55ee5524116..a42b1f230fe 100644 --- a/omp/distributed/matrix_kernels.cpp +++ b/omp/distributed/matrix_kernels.cpp @@ -23,13 +23,13 @@ namespace distributed_matrix { template -void count_overlap_entries( +void count_non_owning_entries( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, array& overlap_count, - array& overlap_positions, + comm_index_type local_part, array& send_count, + array& send_positions, array& original_positions) { auto num_input_elements = input.get_num_stored_elements(); @@ -46,7 +46,7 @@ void count_overlap_entries( row_part_ids_per_entry.get_data()[i] = row_part_id; if (row_part_id != local_part) { #pragma omp atomic - overlap_count.get_data()[row_part_id]++; + send_count.get_data()[row_part_id]++; original_positions.get_data()[i] = i; } else { original_positions.get_data()[i] = -1; @@ -65,28 +65,28 @@ void count_overlap_entries( #pragma omp parallel for for (size_type i = 0; i < num_input_elements; i++) { - overlap_positions.get_data()[i] = + send_positions.get_data()[i] = original_positions.get_const_data()[i] == -1 ? 0 : 1; } - components::prefix_sum_nonnegative(exec, overlap_positions.get_data(), + components::prefix_sum_nonnegative(exec, send_positions.get_data(), num_input_elements); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_COUNT_OVERLAP_ENTRIES); + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); template -void fill_overlap_send_buffers( +void fill_send_buffers( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, const array& overlap_positions, + comm_index_type local_part, const array& send_positions, const array& original_positions, - array& overlap_row_idxs, - array& overlap_col_idxs, array& overlap_values) + array& send_row_idxs, + array& send_col_idxs, array& send_values) { auto input_row_idxs = input.get_const_row_idxs(); auto input_col_idxs = input.get_const_col_idxs(); @@ -96,16 +96,16 @@ void fill_overlap_send_buffers( for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { auto in_pos = original_positions.get_const_data()[i]; if (in_pos >= 0) { - auto out_pos = overlap_positions.get_const_data()[i]; - overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos]; - overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos]; - overlap_values.get_data()[out_pos] = input_vals[in_pos]; + auto out_pos = send_positions.get_const_data()[i]; + send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos]; + send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos]; + send_values.get_data()[out_pos] = input_vals[in_pos]; } } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); + GKO_DECLARE_FILL_SEND_BUFFERS); template diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp index 6a57a64e075..1be2660721b 100644 --- a/reference/distributed/matrix_kernels.cpp +++ b/reference/distributed/matrix_kernels.cpp @@ -20,13 +20,13 @@ namespace distributed_matrix { template -void count_overlap_entries( +void count_non_owning_entries( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, array& overlap_count, - array& overlap_positions, + comm_index_type local_part, array& send_count, + array& send_positions, array& original_positions) { auto num_input_elements = input.get_num_stored_elements(); @@ -41,7 +41,7 @@ void count_overlap_entries( auto row_part_id = row_part_ids[row_range_id]; row_part_ids_per_entry.get_data()[i] = row_part_id; if (row_part_id != local_part) { - overlap_count.get_data()[row_part_id]++; + send_count.get_data()[row_part_id]++; original_positions.get_data()[i] = i; } else { original_positions.get_data()[i] = -1; @@ -59,29 +59,29 @@ void count_overlap_entries( std::stable_sort(original_positions.get_data(), original_positions.get_data() + num_input_elements, comp); for (size_type i = 0; i < num_input_elements; i++) { - overlap_positions.get_data()[i] = + send_positions.get_data()[i] = original_positions.get_const_data()[i] == -1 ? 0 : 1; } - std::exclusive_scan(overlap_positions.get_data(), - overlap_positions.get_data() + num_input_elements, - overlap_positions.get_data(), 0); + std::exclusive_scan(send_positions.get_data(), + send_positions.get_data() + num_input_elements, + send_positions.get_data(), 0); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_COUNT_OVERLAP_ENTRIES); + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); template -void fill_overlap_send_buffers( +void fill_send_buffers( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* row_partition, - comm_index_type local_part, const array& overlap_positions, + comm_index_type local_part, const array& send_positions, const array& original_positions, - array& overlap_row_idxs, - array& overlap_col_idxs, array& overlap_values) + array& send_row_idxs, + array& send_col_idxs, array& send_values) { auto input_row_idxs = input.get_const_row_idxs(); auto input_col_idxs = input.get_const_col_idxs(); @@ -90,16 +90,16 @@ void fill_overlap_send_buffers( for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { auto in_pos = original_positions.get_const_data()[i]; if (in_pos >= 0) { - auto out_pos = overlap_positions.get_const_data()[i]; - overlap_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos]; - overlap_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos]; - overlap_values.get_data()[out_pos] = input_vals[in_pos]; + auto out_pos = send_positions.get_const_data()[i]; + send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos]; + send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos]; + send_values.get_data()[out_pos] = input_vals[in_pos]; } } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( - GKO_DECLARE_FILL_OVERLAP_SEND_BUFFERS); + GKO_DECLARE_FILL_SEND_BUFFERS); template diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp index 80fc8eb3330..80063f7e582 100644 --- a/reference/test/distributed/matrix_kernels.cpp +++ b/reference/test/distributed/matrix_kernels.cpp @@ -190,15 +190,13 @@ TYPED_TEST(Matrix, CountOverlapEntries) { using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; - using vt = typename TestFixture::value_type; using ca = gko::array; using ga = gko::array; this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}}; - std::vector overlap_count_ref{ - ca{this->ref, I{0, 5, 3}}, - ca{this->ref, I{4, 0, 3}}, - ca{this->ref, I{4, 5, 0}}}; - std::vector overlap_pos_ref{ + std::vector send_count_ref{ca{this->ref, I{0, 5, 3}}, + ca{this->ref, I{4, 0, 3}}, + ca{this->ref, I{4, 5, 0}}}; + std::vector send_pos_ref{ ga{this->ref, I{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}}, ga{this->ref, I{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}}, ga{this->ref, I{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}}; @@ -212,16 +210,18 @@ TYPED_TEST(Matrix, CountOverlapEntries) this->ref, this->mapping, num_parts); auto input = this->create_input_full_rank(); - ca overlap_count{this->ref, static_cast(num_parts)}; - ga overlap_positions{this->ref, input.get_num_stored_elements()}; + ca send_count{this->ref, static_cast(num_parts)}; + ga send_positions{this->ref, input.get_num_stored_elements()}; ga original_positions{this->ref, input.get_num_stored_elements()}; for (gko::size_type i = 0; i < num_parts; i++) { - overlap_count.fill(0); - gko::kernels::reference::distributed_matrix::count_overlap_entries( - this->ref, input, partition.get(), i, overlap_count, - overlap_positions, original_positions); - GKO_ASSERT_ARRAY_EQ(overlap_count, overlap_count_ref[i]); - GKO_ASSERT_ARRAY_EQ(overlap_positions, overlap_pos_ref[i]); + send_count.fill(0); + + gko::kernels::reference::distributed_matrix::count_non_owning_entries( + this->ref, input, partition.get(), i, send_count, send_positions, + original_positions); + + GKO_ASSERT_ARRAY_EQ(send_count, send_count_ref[i]); + GKO_ASSERT_ARRAY_EQ(send_positions, send_pos_ref[i]); GKO_ASSERT_ARRAY_EQ(original_positions, original_pos_ref[i]); } } @@ -232,11 +232,10 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers) using lit = typename TestFixture::local_index_type; using git = typename TestFixture::global_index_type; using vt = typename TestFixture::value_type; - using ca = gko::array; using ga = gko::array; using va = gko::array; this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}}; - std::vector overlap_positions{ + std::vector send_positions{ ga{this->ref, I{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}}, ga{this->ref, I{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}}, ga{this->ref, I{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}}; @@ -244,15 +243,15 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers) ga{this->ref, I{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}}, ga{this->ref, I{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}}, ga{this->ref, I{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}}; - std::vector overlap_row_idxs_ref{ + std::vector send_row_idxs_ref{ ga{this->ref, I{0, 0, 5, 5, 6, 2, 3, 3}}, ga{this->ref, I{1, 1, 4, 4, 2, 3, 3}}, ga{this->ref, I{1, 1, 4, 4, 0, 0, 5, 5, 6}}}; - std::vector overlap_col_idxs_ref{ + std::vector send_col_idxs_ref{ ga{this->ref, I{0, 3, 4, 5, 5, 2, 0, 3}}, ga{this->ref, I{1, 2, 4, 6, 2, 0, 3}}, ga{this->ref, I{1, 2, 4, 6, 0, 3, 4, 5, 5}}}; - std::vector overlap_values_ref{ + std::vector send_values_ref{ va{this->ref, I{1, 2, 10, 11, 12, 5, 6, 7}}, va{this->ref, I{3, 4, 8, 9, 5, 6, 7}}, va{this->ref, I{3, 4, 8, 9, 1, 2, 10, 11, 12}}}; @@ -262,21 +261,22 @@ TYPED_TEST(Matrix, FillOverlapSendBuffers) this->ref, this->mapping, num_parts); auto input = this->create_input_full_rank(); - gko::array overlap_row_idxs{this->ref}; - gko::array overlap_col_idxs{this->ref}; - gko::array overlap_values{this->ref}; + gko::array send_row_idxs{this->ref}; + gko::array send_col_idxs{this->ref}; + gko::array send_values{this->ref}; for (gko::size_type i = 0; i < num_parts; i++) { - auto num_entries = overlap_row_idxs_ref[i].get_size(); - overlap_row_idxs.resize_and_reset(num_entries); - overlap_col_idxs.resize_and_reset(num_entries); - overlap_values.resize_and_reset(num_entries); - gko::kernels::reference::distributed_matrix::fill_overlap_send_buffers( - this->ref, input, partition.get(), i, overlap_positions[i], - original_positions[i], overlap_row_idxs, overlap_col_idxs, - overlap_values); - GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, overlap_row_idxs_ref[i]); - GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, overlap_col_idxs_ref[i]); - GKO_ASSERT_ARRAY_EQ(overlap_values, overlap_values_ref[i]); + auto num_entries = send_row_idxs_ref[i].get_size(); + send_row_idxs.resize_and_reset(num_entries); + send_col_idxs.resize_and_reset(num_entries); + send_values.resize_and_reset(num_entries); + + gko::kernels::reference::distributed_matrix::fill_send_buffers( + this->ref, input, partition.get(), i, send_positions[i], + original_positions[i], send_row_idxs, send_col_idxs, send_values); + + GKO_ASSERT_ARRAY_EQ(send_row_idxs, send_row_idxs_ref[i]); + GKO_ASSERT_ARRAY_EQ(send_col_idxs, send_col_idxs_ref[i]); + GKO_ASSERT_ARRAY_EQ(send_values, send_values_ref[i]); } } diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index 6de772e8006..2cef5a49f92 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -63,55 +63,50 @@ class Matrix : public CommonTestFixture { gko::array d_non_local_row_idxs{exec}; gko::array d_non_local_col_idxs{exec}; gko::array d_non_local_values{exec}; - gko::array overlap_count{ref, num_parts}; - overlap_count.fill(0); - gko::array d_overlap_count{exec, num_parts}; - d_overlap_count.fill(0); - gko::array overlap_positions{ref, num_entries}; - gko::array d_overlap_positions{exec, - num_entries}; + gko::array send_count{ref, num_parts}; + send_count.fill(0); + gko::array d_send_count{exec, num_parts}; + d_send_count.fill(0); + gko::array send_positions{ref, num_entries}; + gko::array d_send_positions{exec, num_entries}; gko::array original_positions{ref, num_entries}; gko::array d_original_positions{exec, num_entries}; - gko::kernels::reference::distributed_matrix::count_overlap_entries( - ref, input, row_partition.get(), part, overlap_count, - overlap_positions, original_positions); - gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix:: - count_overlap_entries( - exec, d_input, d_row_partition.get(), part, d_overlap_count, - d_overlap_positions, d_original_positions); - - gko::array overlap_offsets{ref, num_parts + 1}; - std::partial_sum(overlap_count.get_data(), - overlap_count.get_data() + num_parts, - overlap_offsets.get_data() + 1); - overlap_offsets.get_data()[0] = 0; - gko::array d_overlap_offsets{exec, - overlap_offsets}; - gko::size_type num_overlap_entries = - overlap_offsets.get_data()[num_parts]; - gko::array overlap_row_idxs{ref, - num_overlap_entries}; - gko::array overlap_col_idxs{ref, - num_overlap_entries}; - gko::array overlap_values{ref, num_overlap_entries}; - gko::array d_overlap_row_idxs{ - exec, num_overlap_entries}; - gko::array d_overlap_col_idxs{ - exec, num_overlap_entries}; - gko::array d_overlap_values{exec, num_overlap_entries}; - gko::kernels::reference::distributed_matrix:: - fill_overlap_send_buffers(ref, input, row_partition.get(), part, - overlap_positions, original_positions, - overlap_row_idxs, overlap_col_idxs, - overlap_values); + count_non_owning_entries(ref, input, row_partition.get(), part, + send_count, send_positions, + original_positions); + gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix:: + count_non_owning_entries(exec, d_input, d_row_partition.get(), + part, d_send_count, d_send_positions, + d_original_positions); + + gko::array send_offsets{ref, num_parts + 1}; + std::partial_sum(send_count.get_data(), + send_count.get_data() + num_parts, + send_offsets.get_data() + 1); + send_offsets.get_data()[0] = 0; + gko::array d_send_offsets{exec, send_offsets}; + gko::size_type num_send_entries = + send_offsets.get_data()[num_parts]; + gko::array send_row_idxs{ref, num_send_entries}; + gko::array send_col_idxs{ref, num_send_entries}; + gko::array send_values{ref, num_send_entries}; + gko::array d_send_row_idxs{exec, + num_send_entries}; + gko::array d_send_col_idxs{exec, + num_send_entries}; + gko::array d_send_values{exec, num_send_entries}; + + gko::kernels::reference::distributed_matrix::fill_send_buffers( + ref, input, row_partition.get(), part, send_positions, + original_positions, send_row_idxs, send_col_idxs, send_values); gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix:: - fill_overlap_send_buffers( - exec, d_input, d_row_partition.get(), part, - d_overlap_positions, d_original_positions, - d_overlap_row_idxs, d_overlap_col_idxs, d_overlap_values); + fill_send_buffers(exec, d_input, d_row_partition.get(), part, + d_send_positions, d_original_positions, + d_send_row_idxs, d_send_col_idxs, + d_send_values); gko::kernels::reference::distributed_matrix:: separate_local_nonlocal( @@ -125,12 +120,12 @@ class Matrix : public CommonTestFixture { d_non_local_row_idxs, d_non_local_col_idxs, d_non_local_values); - GKO_ASSERT_ARRAY_EQ(overlap_positions, d_overlap_positions); + GKO_ASSERT_ARRAY_EQ(send_positions, d_send_positions); GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions); - GKO_ASSERT_ARRAY_EQ(overlap_count, d_overlap_count); - GKO_ASSERT_ARRAY_EQ(overlap_row_idxs, d_overlap_row_idxs); - GKO_ASSERT_ARRAY_EQ(overlap_col_idxs, d_overlap_col_idxs); - GKO_ASSERT_ARRAY_EQ(overlap_values, d_overlap_values); + GKO_ASSERT_ARRAY_EQ(send_count, d_send_count); + GKO_ASSERT_ARRAY_EQ(send_row_idxs, d_send_row_idxs); + GKO_ASSERT_ARRAY_EQ(send_col_idxs, d_send_col_idxs); + GKO_ASSERT_ARRAY_EQ(send_values, d_send_values); GKO_ASSERT_ARRAY_EQ(local_row_idxs, d_local_row_idxs); GKO_ASSERT_ARRAY_EQ(local_col_idxs, d_local_col_idxs); GKO_ASSERT_ARRAY_EQ(local_values, d_local_values); diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 1b25ad7eb6d..0cfb3aca477 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -152,7 +152,7 @@ TYPED_TEST(MatrixCreation, ReadsDistributedLocalDataWithCommunicate) this->dist_mat->read_distributed( this->dist_input[rank], this->row_part, - gko::experimental::distributed::assembly::communicate); + gko::experimental::distributed::assembly_mode::communicate); GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), res_local[rank], 0); @@ -191,7 +191,7 @@ TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartitionAndCommunicate) this->dist_mat->read_distributed( this->dist_input[rank], this->row_part, this->col_part, - gko::experimental::distributed::assembly::communicate); + gko::experimental::distributed::assembly_mode::communicate); GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), res_local[rank], 0); From fda55dac669a1d6bb261a04663314af293bae01c Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Mon, 21 Oct 2024 12:43:30 +0200 Subject: [PATCH 6/8] Small fixes --- core/distributed/matrix.cpp | 2 +- dpcpp/distributed/matrix_kernels.dp.cpp | 2 +- reference/distributed/matrix_kernels.cpp | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 2c174f26806..acd1b415ef5 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -275,7 +275,7 @@ void Matrix::read_distributed( array send_sizes{exec, num_parts}; array send_positions{exec, num_entries}; array original_positions{exec, num_entries}; - send_sizes.fill(0); + send_sizes.fill(zero()); exec->run(matrix::make_count_non_owning_entries( data, tmp_row_partition.get(), local_part, send_sizes, send_positions, original_positions)); diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp index e7aad7689a8..8582fab9378 100644 --- a/dpcpp/distributed/matrix_kernels.dp.cpp +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -14,7 +14,7 @@ namespace distributed_matrix { template -void non_owningverlap_entries( +void count_non_owning_entries( std::shared_ptr exec, const device_matrix_data& input, const experimental::distributed::Partition* diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp index 1be2660721b..915ec109657 100644 --- a/reference/distributed/matrix_kernels.cpp +++ b/reference/distributed/matrix_kernels.cpp @@ -5,11 +5,11 @@ #include "core/distributed/matrix_kernels.hpp" #include -#include #include "core/base/allocator.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/base/iterator_factory.hpp" +#include "core/components/prefix_sum_kernels.hpp" #include "reference/distributed/partition_helpers.hpp" @@ -63,9 +63,8 @@ void count_non_owning_entries( original_positions.get_const_data()[i] == -1 ? 0 : 1; } - std::exclusive_scan(send_positions.get_data(), - send_positions.get_data() + num_input_elements, - send_positions.get_data(), 0); + components::prefix_sum_nonnegative(exec, send_positions.get_data(), + num_input_elements); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( From a7900d5f8ad60b7b114a0f66ff009a625c0ca784 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Mon, 21 Oct 2024 14:00:26 +0200 Subject: [PATCH 7/8] Add missing include --- core/distributed/matrix.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index acd1b415ef5..dc04e9a9545 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -4,6 +4,7 @@ #include "ginkgo/core/distributed/matrix.hpp" +#include #include #include #include From 1efda8fc6a383ff97be6f73917d3e93401c67c6c Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Tue, 22 Oct 2024 10:49:55 +0200 Subject: [PATCH 8/8] Fix circular dependency with array.fill --- common/cuda_hip/distributed/matrix_kernels.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index 159ddbce296..a1a75a5af63 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -21,6 +21,7 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/atomic.hpp" #include "common/unified/base/kernel_launch.hpp" +#include "core/components/fill_array_kernels.hpp" #include "core/components/format_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" @@ -106,7 +107,9 @@ void count_non_owning_entries( num_input_elements); size_type num_parts = row_partition->get_num_parts(); array row_part_ptrs{exec, num_parts + 1}; - row_part_ptrs.fill(0); + components::fill_array(exec, row_part_ptrs.get_data(), num_parts + 1, + zero()); + components::convert_idxs_to_ptrs( exec, row_part_ids_per_entry.get_const_data(), num_input_elements, num_parts, row_part_ptrs.get_data());