diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index 88988febbb0..a1a75a5af63 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -20,6 +20,10 @@ #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/atomic.hpp" +#include "common/unified/base/kernel_launch.hpp" +#include "core/components/fill_array_kernels.hpp" +#include "core/components/format_conversion_kernels.hpp" +#include "core/components/prefix_sum_kernels.hpp" namespace gko { @@ -49,6 +53,116 @@ struct input_type { }; +template +void count_non_owning_entries( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, array& send_count, + array& send_positions, + array& original_positions) +{ + auto row_part_ids = row_partition->get_part_ids(); + const auto* row_range_bounds = row_partition->get_range_bounds(); + const auto* row_range_starting_indices = + row_partition->get_range_starting_indices(); + const auto num_row_ranges = row_partition->get_num_ranges(); + const auto num_input_elements = input.get_num_stored_elements(); + + auto policy = thrust_policy(exec); + + // precompute the row and column range id of each input element + auto input_row_idxs = input.get_const_row_idxs(); + array row_range_ids{exec, num_input_elements}; + thrust::upper_bound(policy, row_range_bounds + 1, + row_range_bounds + num_row_ranges + 1, input_row_idxs, + input_row_idxs + num_input_elements, + row_range_ids.get_data()); + + array row_part_ids_per_entry{exec, num_input_elements}; + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto part_id, auto part_ids, auto range_ids, + auto part_ids_per_entry, auto orig_positions) { + part_ids_per_entry[i] = part_ids[range_ids[i]]; + orig_positions[i] = part_ids_per_entry[i] == part_id ? -1 : i; + }, + num_input_elements, local_part, row_part_ids, row_range_ids.get_data(), + row_part_ids_per_entry.get_data(), original_positions.get_data()); + + thrust::stable_sort_by_key( + policy, row_part_ids_per_entry.get_data(), + row_part_ids_per_entry.get_data() + num_input_elements, + original_positions.get_data()); + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto orig_positions, auto s_positions) { + s_positions[i] = orig_positions[i] >= 0 ? 1 : 0; + }, + num_input_elements, original_positions.get_const_data(), + send_positions.get_data()); + + components::prefix_sum_nonnegative(exec, send_positions.get_data(), + num_input_elements); + size_type num_parts = row_partition->get_num_parts(); + array row_part_ptrs{exec, num_parts + 1}; + components::fill_array(exec, row_part_ptrs.get_data(), num_parts + 1, + zero()); + + components::convert_idxs_to_ptrs( + exec, row_part_ids_per_entry.get_const_data(), num_input_elements, + num_parts, row_part_ptrs.get_data()); + + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) { + count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i]; + }, + num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); + + +template +void fill_send_buffers( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, const array& send_positions, + const array& original_positions, + array& send_row_idxs, + array& send_col_idxs, array& send_values) +{ + auto num_entries = input.get_num_stored_elements(); + auto input_row_idxs = input.get_const_row_idxs(); + auto input_col_idxs = input.get_const_col_idxs(); + auto input_values = input.get_const_values(); + + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto in_rows, auto in_cols, auto in_vals, + auto in_pos, auto out_pos, auto out_rows, auto out_cols, + auto out_vals) { + if (in_pos[i] >= 0) { + out_rows[out_pos[i]] = in_rows[in_pos[i]]; + out_cols[out_pos[i]] = in_cols[in_pos[i]]; + out_vals[out_pos[i]] = in_vals[in_pos[i]]; + } + }, + num_entries, input_row_idxs, input_col_idxs, input_values, + original_positions.get_const_data(), send_positions.get_const_data(), + send_row_idxs.get_data(), send_col_idxs.get_data(), + send_values.get_data()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_FILL_SEND_BUFFERS); + + template void separate_local_nonlocal( std::shared_ptr exec, diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 1ba925e94e3..746187330f1 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -274,6 +274,9 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( namespace distributed_matrix { +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_FILL_SEND_BUFFERS); GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL); diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 63f359cc40a..dc04e9a9545 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -4,12 +4,14 @@ #include "ginkgo/core/distributed/matrix.hpp" +#include #include #include #include #include #include +#include "core/components/prefix_sum_kernels.hpp" #include "core/distributed/matrix_kernels.hpp" @@ -20,6 +22,10 @@ namespace matrix { namespace { +GKO_REGISTER_OPERATION(count_non_owning_entries, + distributed_matrix::count_non_owning_entries); +GKO_REGISTER_OPERATION(fill_send_buffers, + distributed_matrix::fill_send_buffers); GKO_REGISTER_OPERATION(separate_local_nonlocal, distributed_matrix::separate_local_nonlocal); @@ -243,7 +249,8 @@ void Matrix::read_distributed( std::shared_ptr> row_partition, std::shared_ptr> - col_partition) + col_partition, + assembly_mode assembly_type) { const auto comm = this->get_communicator(); GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size()); @@ -252,14 +259,103 @@ void Matrix::read_distributed( GKO_ASSERT_EQ(comm.size(), col_partition->get_num_parts()); auto exec = this->get_executor(); auto local_part = comm.rank(); + auto use_host_buffer = mpi::requires_host_buffer(exec, comm); + auto tmp_row_partition = make_temporary_clone(exec, row_partition); + auto tmp_col_partition = make_temporary_clone(exec, col_partition); // set up LinOp sizes - auto num_parts = static_cast(row_partition->get_num_parts()); auto global_num_rows = row_partition->get_size(); auto global_num_cols = col_partition->get_size(); dim<2> global_dim{global_num_rows, global_num_cols}; this->set_size(global_dim); + device_matrix_data all_data{exec}; + if (assembly_type == assembly_mode::communicate) { + size_type num_entries = data.get_num_stored_elements(); + size_type num_parts = comm.size(); + array send_sizes{exec, num_parts}; + array send_positions{exec, num_entries}; + array original_positions{exec, num_entries}; + send_sizes.fill(zero()); + exec->run(matrix::make_count_non_owning_entries( + data, tmp_row_partition.get(), local_part, send_sizes, + send_positions, original_positions)); + + send_sizes.set_executor(exec->get_master()); + array send_offsets{exec->get_master(), num_parts + 1}; + array recv_sizes{exec->get_master(), num_parts}; + array recv_offsets{exec->get_master(), num_parts + 1}; + + std::partial_sum(send_sizes.get_data(), + send_sizes.get_data() + num_parts, + send_offsets.get_data() + 1); + comm.all_to_all(exec, send_sizes.get_data(), 1, recv_sizes.get_data(), + 1); + std::partial_sum(recv_sizes.get_data(), + recv_sizes.get_data() + num_parts, + recv_offsets.get_data() + 1); + send_offsets.get_data()[0] = 0; + recv_offsets.get_data()[0] = 0; + + size_type n_send = send_offsets.get_data()[num_parts]; + size_type n_recv = recv_offsets.get_data()[num_parts]; + array send_row_idxs{exec, n_send}; + array send_col_idxs{exec, n_send}; + array send_values{exec, n_send}; + array recv_row_idxs{exec, n_recv}; + array recv_col_idxs{exec, n_recv}; + array recv_values{exec, n_recv}; + exec->run(matrix::make_fill_send_buffers( + data, tmp_row_partition.get(), local_part, send_positions, + original_positions, send_row_idxs, send_col_idxs, send_values)); + + if (use_host_buffer) { + send_row_idxs.set_executor(exec->get_master()); + send_col_idxs.set_executor(exec->get_master()); + send_values.set_executor(exec->get_master()); + recv_row_idxs.set_executor(exec->get_master()); + recv_col_idxs.set_executor(exec->get_master()); + recv_values.set_executor(exec->get_master()); + } + comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(), + send_row_idxs.get_const_data(), send_sizes.get_data(), + send_offsets.get_data(), recv_row_idxs.get_data(), + recv_sizes.get_data(), recv_offsets.get_data()); + comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(), + send_col_idxs.get_const_data(), send_sizes.get_data(), + send_offsets.get_data(), recv_col_idxs.get_data(), + recv_sizes.get_data(), recv_offsets.get_data()); + comm.all_to_all_v(use_host_buffer ? exec : exec->get_master(), + send_values.get_const_data(), send_sizes.get_data(), + send_offsets.get_data(), recv_values.get_data(), + recv_sizes.get_data(), recv_offsets.get_data()); + if (use_host_buffer) { + recv_row_idxs.set_executor(exec); + recv_col_idxs.set_executor(exec); + recv_values.set_executor(exec); + } + + array all_row_idxs{exec, num_entries + n_recv}; + array all_col_idxs{exec, num_entries + n_recv}; + array all_values{exec, num_entries + n_recv}; + exec->copy_from(exec, num_entries, data.get_const_row_idxs(), + all_row_idxs.get_data()); + exec->copy_from(exec, n_recv, recv_row_idxs.get_data(), + all_row_idxs.get_data() + num_entries); + exec->copy_from(exec, num_entries, data.get_const_col_idxs(), + all_col_idxs.get_data()); + exec->copy_from(exec, n_recv, recv_col_idxs.get_data(), + all_col_idxs.get_data() + num_entries); + exec->copy_from(exec, num_entries, data.get_const_values(), + all_values.get_data()); + exec->copy_from(exec, n_recv, recv_values.get_data(), + all_values.get_data() + num_entries); + all_data = device_matrix_data{ + exec, global_dim, std::move(all_row_idxs), std::move(all_col_idxs), + std::move(all_values)}; + all_data.sum_duplicates(); + } + // temporary storage for the output array local_row_idxs{exec}; array local_col_idxs{exec}; @@ -273,8 +369,8 @@ void Matrix::read_distributed( // as well as the rows of the non-local block. The columns of the non-local // block are still in global indices. exec->run(matrix::make_separate_local_nonlocal( - data, make_temporary_clone(exec, row_partition).get(), - make_temporary_clone(exec, col_partition).get(), local_part, + assembly_type == assembly_mode::communicate ? all_data : data, + tmp_row_partition.get(), tmp_col_partition.get(), local_part, local_row_idxs, local_col_idxs, local_values, non_local_row_idxs, global_non_local_col_idxs, non_local_values)); @@ -335,7 +431,6 @@ void Matrix::read_distributed( imap.get_executor(), imap.get_non_local_size(), imap.get_remote_local_idxs().get_const_flat_data()) .copy_to_array(); - auto use_host_buffer = mpi::requires_host_buffer(exec, comm); if (use_host_buffer) { recv_gather_idxs.set_executor(exec->get_master()); gather_idxs_.clear(); @@ -358,12 +453,13 @@ void Matrix::read_distributed( std::shared_ptr> row_partition, std::shared_ptr> - col_partition) + col_partition, + assembly_mode assembly_type) { return this->read_distributed( device_matrix_data::create_from_host( this->get_executor(), data), - row_partition, col_partition); + row_partition, col_partition, assembly_type); } @@ -371,12 +467,13 @@ template void Matrix::read_distributed( const matrix_data& data, std::shared_ptr> - partition) + partition, + assembly_mode assembly_type) { return this->read_distributed( device_matrix_data::create_from_host( this->get_executor(), data), - partition, partition); + partition, partition, assembly_type); } @@ -384,9 +481,10 @@ template void Matrix::read_distributed( const device_matrix_data& data, std::shared_ptr> - partition) + partition, + assembly_mode assembly_type) { - return this->read_distributed(data, partition, partition); + return this->read_distributed(data, partition, partition, assembly_type); } diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index f24e8c9945e..3ba02c27718 100644 --- a/core/distributed/matrix_kernels.hpp +++ b/core/distributed/matrix_kernels.hpp @@ -19,6 +19,32 @@ namespace gko { namespace kernels { +#define GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void count_non_owning_entries( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* row_partition, \ + comm_index_type local_part, array& send_count, \ + array& send_positions, \ + array& original_positions) + + +#define GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void fill_send_buffers( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* row_partition, \ + comm_index_type local_part, \ + const array& send_positions, \ + const array& original_positions, \ + array& send_row_idxs, \ + array& send_col_idxs, array& send_values) + + #define GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ GlobalIndexType) \ void separate_local_nonlocal( \ @@ -35,11 +61,18 @@ namespace kernels { array& non_local_values) -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - using comm_index_type = experimental::distributed::comm_index_type; \ - template \ - GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + using comm_index_type = experimental::distributed::comm_index_type; \ + template \ + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES(ValueType, LocalIndexType, \ + GlobalIndexType); \ + template \ + GKO_DECLARE_FILL_SEND_BUFFERS(ValueType, LocalIndexType, GlobalIndexType); \ + template \ + GKO_DECLARE_SEPARATE_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ GlobalIndexType) diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp index 47adaaeca59..8582fab9378 100644 --- a/dpcpp/distributed/matrix_kernels.dp.cpp +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -13,6 +13,36 @@ namespace dpcpp { namespace distributed_matrix { +template +void count_non_owning_entries( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, array& send_count, + array& send_positions, + array& original_positions) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); + + +template +void fill_send_buffers( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, const array& send_positions, + const array& original_positions, + array& send_row_idxs, + array& send_col_idxs, + array& send_values) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_FILL_SEND_BUFFERS); + + template void separate_local_nonlocal( std::shared_ptr exec, diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index de719bb9315..3f1885bc8bf 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -133,6 +133,18 @@ namespace experimental { namespace distributed { +/** + * assembly_mode defines how the read_distributed function of the distributed + * matrix treats non-local indices in the (device_)matrix_data: + * - communicate communicates the overlap between ranks and adds up all local + * contributions. Indices smaller than 0 or larger than the global size + * of the matrix are ignored. + * - local_only does not communicate any overlap but ignores all non-local + * indices. + */ +enum class assembly_mode { communicate, local_only }; + + template class Partition; template @@ -297,7 +309,8 @@ class Matrix void read_distributed( const device_matrix_data& data, std::shared_ptr> - partition); + partition, + assembly_mode assembly_type = assembly_mode::local_only); /** * Reads a square matrix from the matrix_data structure and a global @@ -311,7 +324,8 @@ class Matrix void read_distributed( const matrix_data& data, std::shared_ptr> - partition); + partition, + assembly_mode assembly_type = assembly_mode::local_only); /** * Reads a matrix from the device_matrix_data structure, a global row @@ -335,7 +349,8 @@ class Matrix std::shared_ptr> row_partition, std::shared_ptr> - col_partition); + col_partition, + assembly_mode assembly_type = assembly_mode::local_only); /** * Reads a matrix from the matrix_data structure, a global row partition, @@ -351,7 +366,8 @@ class Matrix std::shared_ptr> row_partition, std::shared_ptr> - col_partition); + col_partition, + assembly_mode assembly_type = assembly_mode::local_only); /** * Get read access to the stored local matrix. diff --git a/omp/distributed/matrix_kernels.cpp b/omp/distributed/matrix_kernels.cpp index 2f36ec4a778..a42b1f230fe 100644 --- a/omp/distributed/matrix_kernels.cpp +++ b/omp/distributed/matrix_kernels.cpp @@ -4,6 +4,8 @@ #include "core/distributed/matrix_kernels.hpp" +#include + #include #include @@ -20,6 +22,92 @@ namespace omp { namespace distributed_matrix { +template +void count_non_owning_entries( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, array& send_count, + array& send_positions, + array& original_positions) +{ + auto num_input_elements = input.get_num_stored_elements(); + auto input_row_idxs = input.get_const_row_idxs(); + auto row_part_ids = row_partition->get_part_ids(); + array row_part_ids_per_entry{exec, num_input_elements}; + + size_type row_range_id = 0; +#pragma omp parallel for firstprivate(row_range_id) + for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { + auto global_row = input_row_idxs[i]; + row_range_id = find_range(global_row, row_partition, row_range_id); + auto row_part_id = row_part_ids[row_range_id]; + row_part_ids_per_entry.get_data()[i] = row_part_id; + if (row_part_id != local_part) { +#pragma omp atomic + send_count.get_data()[row_part_id]++; + original_positions.get_data()[i] = i; + } else { + original_positions.get_data()[i] = -1; + } + } + + auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) { + comm_index_type a = + i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i]; + comm_index_type b = + j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j]; + return a < b; + }; + std::stable_sort(original_positions.get_data(), + original_positions.get_data() + num_input_elements, comp); + +#pragma omp parallel for + for (size_type i = 0; i < num_input_elements; i++) { + send_positions.get_data()[i] = + original_positions.get_const_data()[i] == -1 ? 0 : 1; + } + + components::prefix_sum_nonnegative(exec, send_positions.get_data(), + num_input_elements); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); + + +template +void fill_send_buffers( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, const array& send_positions, + const array& original_positions, + array& send_row_idxs, + array& send_col_idxs, array& send_values) +{ + auto input_row_idxs = input.get_const_row_idxs(); + auto input_col_idxs = input.get_const_col_idxs(); + auto input_vals = input.get_const_values(); + +#pragma omp parallel for + for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { + auto in_pos = original_positions.get_const_data()[i]; + if (in_pos >= 0) { + auto out_pos = send_positions.get_const_data()[i]; + send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos]; + send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos]; + send_values.get_data()[out_pos] = input_vals[in_pos]; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_FILL_SEND_BUFFERS); + + template void separate_local_nonlocal( std::shared_ptr exec, diff --git a/reference/distributed/matrix_kernels.cpp b/reference/distributed/matrix_kernels.cpp index 95176b34656..915ec109657 100644 --- a/reference/distributed/matrix_kernels.cpp +++ b/reference/distributed/matrix_kernels.cpp @@ -4,9 +4,12 @@ #include "core/distributed/matrix_kernels.hpp" +#include + #include "core/base/allocator.hpp" #include "core/base/device_matrix_data_kernels.hpp" #include "core/base/iterator_factory.hpp" +#include "core/components/prefix_sum_kernels.hpp" #include "reference/distributed/partition_helpers.hpp" @@ -16,6 +19,88 @@ namespace reference { namespace distributed_matrix { +template +void count_non_owning_entries( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, array& send_count, + array& send_positions, + array& original_positions) +{ + auto num_input_elements = input.get_num_stored_elements(); + auto input_row_idxs = input.get_const_row_idxs(); + auto row_part_ids = row_partition->get_part_ids(); + array row_part_ids_per_entry{exec, num_input_elements}; + + size_type row_range_id = 0; + for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { + auto global_row = input_row_idxs[i]; + row_range_id = find_range(global_row, row_partition, row_range_id); + auto row_part_id = row_part_ids[row_range_id]; + row_part_ids_per_entry.get_data()[i] = row_part_id; + if (row_part_id != local_part) { + send_count.get_data()[row_part_id]++; + original_positions.get_data()[i] = i; + } else { + original_positions.get_data()[i] = -1; + } + } + + auto comp = [row_part_ids_per_entry, local_part](auto i, auto j) { + comm_index_type a = + i == -1 ? local_part : row_part_ids_per_entry.get_const_data()[i]; + comm_index_type b = + j == -1 ? local_part : row_part_ids_per_entry.get_const_data()[j]; + return a < b; + }; + + std::stable_sort(original_positions.get_data(), + original_positions.get_data() + num_input_elements, comp); + for (size_type i = 0; i < num_input_elements; i++) { + send_positions.get_data()[i] = + original_positions.get_const_data()[i] == -1 ? 0 : 1; + } + + components::prefix_sum_nonnegative(exec, send_positions.get_data(), + num_input_elements); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_COUNT_NON_OWNING_ENTRIES); + + +template +void fill_send_buffers( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + comm_index_type local_part, const array& send_positions, + const array& original_positions, + array& send_row_idxs, + array& send_col_idxs, array& send_values) +{ + auto input_row_idxs = input.get_const_row_idxs(); + auto input_col_idxs = input.get_const_col_idxs(); + auto input_vals = input.get_const_values(); + + for (size_type i = 0; i < input.get_num_stored_elements(); ++i) { + auto in_pos = original_positions.get_const_data()[i]; + if (in_pos >= 0) { + auto out_pos = send_positions.get_const_data()[i]; + send_row_idxs.get_data()[out_pos] = input_row_idxs[in_pos]; + send_col_idxs.get_data()[out_pos] = input_col_idxs[in_pos]; + send_values.get_data()[out_pos] = input_vals[in_pos]; + } + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_FILL_SEND_BUFFERS); + + template void separate_local_nonlocal( std::shared_ptr exec, diff --git a/reference/test/distributed/matrix_kernels.cpp b/reference/test/distributed/matrix_kernels.cpp index a34844cbde9..80063f7e582 100644 --- a/reference/test/distributed/matrix_kernels.cpp +++ b/reference/test/distributed/matrix_kernels.cpp @@ -186,6 +186,101 @@ TYPED_TEST_SUITE(Matrix, gko::test::ValueLocalGlobalIndexTypes, TupleTypenameNameGenerator); +TYPED_TEST(Matrix, CountOverlapEntries) +{ + using lit = typename TestFixture::local_index_type; + using git = typename TestFixture::global_index_type; + using ca = gko::array; + using ga = gko::array; + this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}}; + std::vector send_count_ref{ca{this->ref, I{0, 5, 3}}, + ca{this->ref, I{4, 0, 3}}, + ca{this->ref, I{4, 5, 0}}}; + std::vector send_pos_ref{ + ga{this->ref, I{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}}, + ga{this->ref, I{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}}, + ga{this->ref, I{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}}; + std::vector original_pos_ref{ + ga{this->ref, I{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}}, + ga{this->ref, I{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}}, + ga{this->ref, I{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}}; + comm_index_type num_parts = 3; + auto partition = + gko::experimental::distributed::Partition::build_from_mapping( + this->ref, this->mapping, num_parts); + auto input = this->create_input_full_rank(); + + ca send_count{this->ref, static_cast(num_parts)}; + ga send_positions{this->ref, input.get_num_stored_elements()}; + ga original_positions{this->ref, input.get_num_stored_elements()}; + for (gko::size_type i = 0; i < num_parts; i++) { + send_count.fill(0); + + gko::kernels::reference::distributed_matrix::count_non_owning_entries( + this->ref, input, partition.get(), i, send_count, send_positions, + original_positions); + + GKO_ASSERT_ARRAY_EQ(send_count, send_count_ref[i]); + GKO_ASSERT_ARRAY_EQ(send_positions, send_pos_ref[i]); + GKO_ASSERT_ARRAY_EQ(original_positions, original_pos_ref[i]); + } +} + + +TYPED_TEST(Matrix, FillOverlapSendBuffers) +{ + using lit = typename TestFixture::local_index_type; + using git = typename TestFixture::global_index_type; + using vt = typename TestFixture::value_type; + using ga = gko::array; + using va = gko::array; + this->mapping = {this->ref, {1, 0, 2, 2, 0, 1, 1}}; + std::vector send_positions{ + ga{this->ref, I{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7}}, + ga{this->ref, I{0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 5, 6}}, + ga{this->ref, I{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9}}}; + std::vector original_positions{ + ga{this->ref, I{-1, -1, -1, -1, 0, 1, 9, 10, 11, 4, 5, 6}}, + ga{this->ref, I{2, 3, 7, 8, -1, -1, -1, -1, -1, 4, 5, 6}}, + ga{this->ref, I{2, 3, 7, 8, 0, 1, 9, 10, 11, -1, -1, -1}}}; + std::vector send_row_idxs_ref{ + ga{this->ref, I{0, 0, 5, 5, 6, 2, 3, 3}}, + ga{this->ref, I{1, 1, 4, 4, 2, 3, 3}}, + ga{this->ref, I{1, 1, 4, 4, 0, 0, 5, 5, 6}}}; + std::vector send_col_idxs_ref{ + ga{this->ref, I{0, 3, 4, 5, 5, 2, 0, 3}}, + ga{this->ref, I{1, 2, 4, 6, 2, 0, 3}}, + ga{this->ref, I{1, 2, 4, 6, 0, 3, 4, 5, 5}}}; + std::vector send_values_ref{ + va{this->ref, I{1, 2, 10, 11, 12, 5, 6, 7}}, + va{this->ref, I{3, 4, 8, 9, 5, 6, 7}}, + va{this->ref, I{3, 4, 8, 9, 1, 2, 10, 11, 12}}}; + comm_index_type num_parts = 3; + auto partition = + gko::experimental::distributed::Partition::build_from_mapping( + this->ref, this->mapping, num_parts); + auto input = this->create_input_full_rank(); + + gko::array send_row_idxs{this->ref}; + gko::array send_col_idxs{this->ref}; + gko::array send_values{this->ref}; + for (gko::size_type i = 0; i < num_parts; i++) { + auto num_entries = send_row_idxs_ref[i].get_size(); + send_row_idxs.resize_and_reset(num_entries); + send_col_idxs.resize_and_reset(num_entries); + send_values.resize_and_reset(num_entries); + + gko::kernels::reference::distributed_matrix::fill_send_buffers( + this->ref, input, partition.get(), i, send_positions[i], + original_positions[i], send_row_idxs, send_col_idxs, send_values); + + GKO_ASSERT_ARRAY_EQ(send_row_idxs, send_row_idxs_ref[i]); + GKO_ASSERT_ARRAY_EQ(send_col_idxs, send_col_idxs_ref[i]); + GKO_ASSERT_ARRAY_EQ(send_values, send_values_ref[i]); + } +} + + TYPED_TEST(Matrix, SeparateLocalNonLocalEmpty) { using lit = typename TestFixture::local_index_type; diff --git a/test/distributed/matrix_kernels.cpp b/test/distributed/matrix_kernels.cpp index ad91d699496..2cef5a49f92 100644 --- a/test/distributed/matrix_kernels.cpp +++ b/test/distributed/matrix_kernels.cpp @@ -48,8 +48,9 @@ class Matrix : public CommonTestFixture { { gko::device_matrix_data d_input{exec, input}; - for (comm_index_type part = 0; part < row_partition->get_num_parts(); - ++part) { + gko::size_type num_parts = row_partition->get_num_parts(); + gko::size_type num_entries = input.get_num_stored_elements(); + for (comm_index_type part = 0; part < num_parts; ++part) { gko::array local_row_idxs{ref}; gko::array local_col_idxs{ref}; gko::array local_values{ref}; @@ -62,6 +63,50 @@ class Matrix : public CommonTestFixture { gko::array d_non_local_row_idxs{exec}; gko::array d_non_local_col_idxs{exec}; gko::array d_non_local_values{exec}; + gko::array send_count{ref, num_parts}; + send_count.fill(0); + gko::array d_send_count{exec, num_parts}; + d_send_count.fill(0); + gko::array send_positions{ref, num_entries}; + gko::array d_send_positions{exec, num_entries}; + gko::array original_positions{ref, num_entries}; + gko::array d_original_positions{exec, + num_entries}; + + gko::kernels::reference::distributed_matrix:: + count_non_owning_entries(ref, input, row_partition.get(), part, + send_count, send_positions, + original_positions); + gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix:: + count_non_owning_entries(exec, d_input, d_row_partition.get(), + part, d_send_count, d_send_positions, + d_original_positions); + + gko::array send_offsets{ref, num_parts + 1}; + std::partial_sum(send_count.get_data(), + send_count.get_data() + num_parts, + send_offsets.get_data() + 1); + send_offsets.get_data()[0] = 0; + gko::array d_send_offsets{exec, send_offsets}; + gko::size_type num_send_entries = + send_offsets.get_data()[num_parts]; + gko::array send_row_idxs{ref, num_send_entries}; + gko::array send_col_idxs{ref, num_send_entries}; + gko::array send_values{ref, num_send_entries}; + gko::array d_send_row_idxs{exec, + num_send_entries}; + gko::array d_send_col_idxs{exec, + num_send_entries}; + gko::array d_send_values{exec, num_send_entries}; + + gko::kernels::reference::distributed_matrix::fill_send_buffers( + ref, input, row_partition.get(), part, send_positions, + original_positions, send_row_idxs, send_col_idxs, send_values); + gko::kernels::GKO_DEVICE_NAMESPACE::distributed_matrix:: + fill_send_buffers(exec, d_input, d_row_partition.get(), part, + d_send_positions, d_original_positions, + d_send_row_idxs, d_send_col_idxs, + d_send_values); gko::kernels::reference::distributed_matrix:: separate_local_nonlocal( @@ -75,6 +120,12 @@ class Matrix : public CommonTestFixture { d_non_local_row_idxs, d_non_local_col_idxs, d_non_local_values); + GKO_ASSERT_ARRAY_EQ(send_positions, d_send_positions); + GKO_ASSERT_ARRAY_EQ(original_positions, d_original_positions); + GKO_ASSERT_ARRAY_EQ(send_count, d_send_count); + GKO_ASSERT_ARRAY_EQ(send_row_idxs, d_send_row_idxs); + GKO_ASSERT_ARRAY_EQ(send_col_idxs, d_send_col_idxs); + GKO_ASSERT_ARRAY_EQ(send_values, d_send_values); GKO_ASSERT_ARRAY_EQ(local_row_idxs, d_local_row_idxs); GKO_ASSERT_ARRAY_EQ(local_col_idxs, d_local_col_idxs); GKO_ASSERT_ARRAY_EQ(local_values, d_local_values); diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index f4b8af2fb19..0cfb3aca477 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -60,9 +60,16 @@ class MatrixCreation : public CommonMpiTestFixture { {3, 4, 7}, {4, 0, 9}, {4, 4, 10}}}, - dist_input{{{size, {{0, 1, 1}, {0, 3, 2}, {1, 1, 3}, {1, 2, 4}}}, - {size, {{2, 1, 5}, {2, 2, 6}, {3, 3, 8}, {3, 4, 7}}}, - {size, {{4, 0, 9}, {4, 4, 10}}}}}, + dist_input{ + {{size, + {{0, 1, 1}, + {0, 3, 2}, + {1, 1, 3}, + {1, 2, 4}, + {2, 0, 1}, + {2, 3, 1}}}, + {size, {{0, 0, 1}, {2, 1, 5}, {2, 2, 6}, {3, 3, 8}, {3, 4, 7}}}, + {size, {{2, 2, 1}, {3, 3, -1}, {4, 0, 9}, {4, 4, 10}}}}}, engine(42) { row_part = Partition::build_from_contiguous( @@ -134,6 +141,26 @@ TYPED_TEST(MatrixCreation, ReadsDistributedLocalData) } +TYPED_TEST(MatrixCreation, ReadsDistributedLocalDataWithCommunicate) +{ + using value_type = typename TestFixture::value_type; + using csr = typename TestFixture::local_matrix_type; + I> res_local[] = {{{1, 1}, {0, 3}}, {{7, 1}, {0, 7}}, {{10}}}; + I> res_non_local[] = { + {{0, 2}, {4, 0}}, {{1, 5, 0}, {0, 0, 7}}, {{9}}}; + auto rank = this->dist_mat->get_communicator().rank(); + + this->dist_mat->read_distributed( + this->dist_input[rank], this->row_part, + gko::experimental::distributed::assembly_mode::communicate); + + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), + res_local[rank], 0); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), + res_non_local[rank], 0); +} + + TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition) { using value_type = typename TestFixture::value_type; @@ -153,6 +180,26 @@ TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartition) } +TYPED_TEST(MatrixCreation, ReadsDistributedWithColPartitionAndCommunicate) +{ + using value_type = typename TestFixture::value_type; + using csr = typename TestFixture::local_matrix_type; + I> res_local[] = {{{2, 0}, {0, 0}}, {{1, 5}, {0, 0}}, {{0}}}; + I> res_non_local[] = { + {{1, 1, 0}, {0, 3, 4}}, {{1, 0, 7}, {7, 7, 0}}, {{10, 9}}}; + auto rank = this->dist_mat->get_communicator().rank(); + + this->dist_mat->read_distributed( + this->dist_input[rank], this->row_part, this->col_part, + gko::experimental::distributed::assembly_mode::communicate); + + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_local_matrix()), + res_local[rank], 0); + GKO_ASSERT_MTX_NEAR(gko::as(this->dist_mat->get_non_local_matrix()), + res_non_local[rank], 0); +} + + TYPED_TEST(MatrixCreation, BuildOnlyLocal) { using value_type = typename TestFixture::value_type;