diff --git a/.github/workflows/windows-mingw.yml b/.github/workflows/windows-mingw.yml index fc1906c982d..d70b94b8b28 100644 --- a/.github/workflows/windows-mingw.yml +++ b/.github/workflows/windows-mingw.yml @@ -50,7 +50,7 @@ jobs: mkdir build cd build cmake -G "MinGW Makefiles" -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_BUILD_TYPE=${{ matrix.config.build_type }} -DGINKGO_COMPILER_FLAGS=${{ matrix.config.cflags }} .. - cmake --build . -j4 + cmake --build . -j2 shell: cmd - name: install diff --git a/.github/workflows/windows-msvc-ref.yml b/.github/workflows/windows-msvc-ref.yml index f7d73e2fd82..a5be64c9daa 100644 --- a/.github/workflows/windows-msvc-ref.yml +++ b/.github/workflows/windows-msvc-ref.yml @@ -27,8 +27,9 @@ jobs: fail-fast: false matrix: config: - - {shared: "ON", build_type: "Debug", name: "reference/debug/shared"} - - {shared: "OFF", build_type: "Release", name: "reference/release/static"} + # Debug with half precision has the issue "library limit of 65535 objects exceeded" + - {shared: "ON", build_type: "Debug", name: "reference/debug/shared", half: "OFF"} + - {shared: "OFF", build_type: "Release", name: "reference/release/static", half: "ON"} # Debug static needs too much storage # - {shared: "OFF", build_type: "Debug", name: "reference/debug/static"} name: msvc/${{ matrix.config.name }} @@ -47,7 +48,7 @@ jobs: run: | mkdir build cd build - cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG='/MDd /Zi /Ob1 /O1 /Od /RTC1' -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF .. + cmake -DBUILD_SHARED_LIBS=${{ matrix.config.shared }} -DCMAKE_CXX_FLAGS_DEBUG='/MDd /Zi /Ob1 /O1 /Od /RTC1' -DGINKGO_BUILD_CUDA=OFF -DGINKGO_BUILD_OMP=OFF -DGINKGO_ENABLE_HALF=${{ matrix.config.half }} .. cmake --build . -j4 --config ${{ matrix.config.build_type }} ctest . 
-C ${{ matrix.config.build_type }} --output-on-failure diff --git a/CMakeLists.txt b/CMakeLists.txt index efb3fcc24ff..2d8d65e354c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF) option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) +option(GINKGO_ENABLE_HALF "Enable the half operation" ON) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_EXPORT_BUILD_DIR diff --git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp index 30af6b24777..0167ee0f9c4 100644 --- a/accessor/cuda_helper.hpp +++ b/accessor/cuda_helper.hpp @@ -47,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "utils.hpp" +struct __half; + + namespace gko { namespace acc { namespace detail { @@ -57,6 +60,16 @@ struct cuda_type { using type = T; }; +template <> +struct cuda_type { + using type = __half; +}; + +template <> +struct cuda_type { + using type = __nv_bfloat16; +}; + // Unpack cv and reference / pointer qualifiers template struct cuda_type { @@ -87,7 +100,7 @@ struct cuda_type { // Transform std::complex to thrust::complex template struct cuda_type> { - using type = thrust::complex; + using type = thrust::complex::type>; }; diff --git a/accessor/hip_helper.hpp b/accessor/hip_helper.hpp index 9848b4360f8..4b1054907ba 100644 --- a/accessor/hip_helper.hpp +++ b/accessor/hip_helper.hpp @@ -47,6 +47,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "utils.hpp" +struct __half; + + namespace gko { namespace acc { namespace detail { @@ -83,11 +86,20 @@ struct hip_type { using type = typename hip_type::type&&; }; +template <> +struct hip_type { + using type = __half; +}; + +template <> +struct hip_type { + using type = hip_bfloat16; +}; // Transform std::complex to thrust::complex template struct hip_type> { - using type = thrust::complex; + using type = thrust::complex::type>; }; diff --git a/accessor/reduced_row_major_reference.hpp b/accessor/reduced_row_major_reference.hpp index 10960316eb2..59122160835 100644 --- a/accessor/reduced_row_major_reference.hpp +++ b/accessor/reduced_row_major_reference.hpp @@ -103,7 +103,7 @@ class reduced_storage operator=(arithmetic_type val) && { storage_type* const GKO_ACC_RESTRICT r_ptr = ptr_; - *r_ptr = val; + *r_ptr = detail::implicit_explicit_conversion(val); return val; } @@ -115,7 +115,8 @@ class reduced_storage } constexpr GKO_ACC_ATTRIBUTES arithmetic_type - operator=(reduced_storage&& ref) && noexcept + operator=(reduced_storage&& ref) && + noexcept { std::move(*this) = ref.implicit_conversion(); return *this; diff --git a/accessor/reference_helper.hpp b/accessor/reference_helper.hpp index 40dc4bebaf2..18d55712130 100644 --- a/accessor/reference_helper.hpp +++ b/accessor/reference_helper.hpp @@ -43,8 +43,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// CUDA TOOLKIT < 11 does not support constexpr in combination with // thrust::complex, which is why constexpr is only present in later versions -#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \ - (__CUDACC_VER_MAJOR__ < 11) +// TODO: NVC++ constexpr +#if (defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \ + (__CUDACC_VER_MAJOR__ < 11)) || \ + (defined(__NVCOMPILER) && GINKGO_ENABLE_HALF) #define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index fd04620f595..61ff7aeb557 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -24,15 +24,12 @@ function(ginkgo_benchmark_cusparse_linops type def) endfunction() function(ginkgo_benchmark_hipsparse_linops type def) - add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp) + set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) + hip_add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp + HIPCC_OPTIONS ${GINKGO_HIPCC_OPTIONS} -D${def} + CLANG_OPTIONS ${GINKGO_HIP_CLANG_OPTIONS} + NVCC_OPTIONS ${GINKGO_HIP_NVCC_OPTIONS}) target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def}) - EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) - set_target_properties(hipsparse_linops_${type} PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS}) - # use Thrust C++ device just for compilation, we don't use thrust::complex in the benchmarks - target_compile_definitions(hipsparse_linops_${type} PUBLIC -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP) - target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE - ${HSA_HEADER} ${HIP_INCLUDE_DIRS} - ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS}) target_link_libraries(hipsparse_linops_${type} Ginkgo::ginkgo ${HIPSPARSE_LIBRARIES}) endfunction() @@ -79,17 +76,25 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty target_compile_options("${name}" PRIVATE 
${GINKGO_COMPILER_FLAGS}) ginkgo_benchmark_add_tuning_maybe("${name}") if("${use_lib_linops}") - if (GINKGO_BUILD_CUDA) - target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) - target_link_libraries("${name}" cusparse_linops_${type}) - endif() - if (GINKGO_BUILD_HIP) - target_compile_definitions("${name}" PRIVATE HAS_HIP=1) - target_link_libraries("${name}" hipsparse_linops_${type}) - endif() - if (GINKGO_BUILD_DPCPP) - target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1) - target_link_libraries("${name}" onemkl_linops_${type}) + if ("${type}" STREQUAL "h") + # only cuda supports half currently + if (GINKGO_BUILD_CUDA) + target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) + target_link_libraries("${name}" cusparse_linops_${type}) + endif() + else() + if (GINKGO_BUILD_CUDA) + target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) + target_link_libraries("${name}" cusparse_linops_${type}) + endif() + if (GINKGO_BUILD_HIP) + target_compile_definitions("${name}" PRIVATE HAS_HIP=1) + target_link_libraries("${name}" hipsparse_linops_${type}) + endif() + if (GINKGO_BUILD_DPCPP) + target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1) + target_link_libraries("${name}" onemkl_linops_${type}) + endif() endif() endif() endfunction(ginkgo_add_single_benchmark_executable) @@ -119,6 +124,9 @@ if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) + if (GINKGO_ENABLE_HALF) + ginkgo_benchmark_cusparse_linops(h GKO_BENCHMARK_USE_HALF_PRECISION) + endif() add_library(cuda_timer utils/cuda_timer.cpp) target_link_libraries(cuda_timer ginkgo CUDA::cudart) endif() diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh index 2a614a87904..0efc0f0b3c2 100755 --- a/benchmark/run_all_benchmarks.sh +++ b/benchmark/run_all_benchmarks.sh @@ -110,6 +110,8 @@ 
elif [ "${BENCHMARK_PRECISION}" == "dcomplex" ]; then BENCH_SUFFIX="_dcomplex" elif [ "${BENCHMARK_PRECISION}" == "scomplex" ]; then BENCH_SUFFIX="_scomplex" +elif [ "${BENCHMARK_PRECISION}" == "half" ]; then + BENCH_SUFFIX="_half" else echo "BENCHMARK_PRECISION is set to the not supported \"${BENCHMARK_PRECISION}\"." 1>&2 echo "Currently supported values: \"double\", \"single\", \"dcomplex\" and \"scomplex\"" 1>&2 @@ -216,9 +218,16 @@ keep_latest() { compute_matrix_statistics() { [ "${DRY_RUN}" == "true" ] && return cp "$1" "$1.imd" # make sure we're not loosing the original input - ./matrix_statistics/matrix_statistics${BENCH_SUFFIX} \ - --backup="$1.bkp" --double_buffer="$1.bkp2" \ - <"$1.imd" 2>&1 >"$1" + if [ "${BENCH_SUFFIX}" == "_half" ]; then + # half precision benchmark still uses single for statistics + ./matrix_statistics/matrix_statistics_single \ + --backup="$1.bkp" --double_buffer="$1.bkp2" \ + <"$1.imd" 2>&1 >"$1" + else + ./matrix_statistics/matrix_statistics${BENCH_SUFFIX} \ + --backup="$1.bkp" --double_buffer="$1.bkp2" \ + <"$1.imd" 2>&1 >"$1" + fi keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } diff --git a/benchmark/spmv/CMakeLists.txt b/benchmark/spmv/CMakeLists.txt index 1e3bab1c884..0165d96a264 100644 --- a/benchmark/spmv/CMakeLists.txt +++ b/benchmark/spmv/CMakeLists.txt @@ -1,4 +1,9 @@ ginkgo_add_typed_benchmark_executables(spmv "YES" spmv.cpp) +# TODO: move to all benchmark +if (GINKGO_ENABLE_HALF) + ginkgo_add_single_benchmark_executable( + "spmv_half" "YES" "GKO_BENCHMARK_USE_HALF_PRECISION" "h" spmv.cpp) +endif() if(GINKGO_BUILD_MPI) add_subdirectory(distributed) endif() diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp index c85642bb5f1..1f8bf590703 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -144,7 +144,9 @@ struct SpmvBenchmark : Benchmark> { exec->synchronize(); auto max_relative_norm2 = compute_max_relative_norm2(x_clone.get(), state.answer.get()); - 
format_case["max_relative_norm2"] = max_relative_norm2; + format_case["max_relative_norm2"] = + static_cast::type>( + max_relative_norm2); } IterationControl ic{timer}; diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp index e2221614d9c..77c8d1f2f5c 100644 --- a/benchmark/utils/cuda_linops.cpp +++ b/benchmark/utils/cuda_linops.cpp @@ -558,14 +558,19 @@ class CusparseHybrid ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__))) +// cuSPARSE does not support 16 bit compute for full 16 bit floating point +// input. Also, the scalar must be the compute type, i.e. float. template -void cusparse_generic_spmv(std::shared_ptr gpu_exec, - const cusparseSpMatDescr_t mat, - const gko::array& scalars, - const gko::LinOp* b, gko::LinOp* x, - cusparseOperation_t trans, cusparseSpMVAlg_t alg) +void cusparse_generic_spmv( + std::shared_ptr gpu_exec, + const cusparseSpMatDescr_t mat, + const gko::array::type>& scalars, + const gko::LinOp* b, gko::LinOp* x, cusparseOperation_t trans, + cusparseSpMVAlg_t alg) { cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type(); + cudaDataType_t compute_value = gko::kernels::cuda::cuda_data_type< + typename gko::detail::arth_type::type>(); using gko::kernels::cuda::as_culibs_type; auto dense_b = gko::as>(b); auto dense_x = gko::as>(x); @@ -584,13 +589,14 @@ void cusparse_generic_spmv(std::shared_ptr gpu_exec, gko::size_type buffer_size = 0; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize( gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], - mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, + mat, vecb, &scalars.get_const_data()[1], vecx, compute_value, alg, &buffer_size)); gko::array buffer_array(gpu_exec, buffer_size); auto dbuffer = buffer_array.get_data(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV( gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], - mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, dbuffer)); + mat, vecb, 
&scalars.get_const_data()[1], vecx, compute_value, alg, + dbuffer)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb)); } @@ -669,8 +675,8 @@ class CusparseGenericCsr protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override { - cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, - Alg); + cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, + x, trans_, Alg); } void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, @@ -686,9 +692,11 @@ class CusparseGenericCsr {} private: + using compute_type = typename gko::detail::arth_type::type; // Contains {alpha, beta} - gko::array scalars{ - this->get_executor(), {gko::one(), gko::zero()}}; + gko::array scalars{ + this->get_executor(), + {gko::one(), gko::zero()}}; std::shared_ptr csr_; cusparseOperation_t trans_; cusparseSpMatDescr_t mat_; @@ -761,8 +769,8 @@ class CusparseGenericCoo protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override { - cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, - default_csr_alg); + cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, + x, trans_, default_csr_alg); } void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, @@ -777,9 +785,11 @@ class CusparseGenericCoo {} private: + using compute_type = typename gko::detail::arth_type::type; // Contains {alpha, beta} - gko::array scalars{ - this->get_executor(), {gko::one(), gko::zero()}}; + gko::array scalars{ + this->get_executor(), + {gko::one(), gko::zero()}}; std::shared_ptr coo_; cusparseOperation_t trans_; cusparseSpMatDescr_t mat_; diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index 3f26ed3f2fc..ad39ac28d85 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -158,10 +158,7 @@ struct DefaultSystemGenerator { { auto res = Vec::create(exec); res->read(gko::matrix_data( - size, - 
std::uniform_real_distribution>(-1.0, - 1.0), - get_engine())); + size, std::uniform_real_distribution<>(-1.0, 1.0), get_engine())); return res; } diff --git a/benchmark/utils/types.hpp b/benchmark/utils/types.hpp index 6ac57ad23c2..fa79bea3801 100644 --- a/benchmark/utils/types.hpp +++ b/benchmark/utils/types.hpp @@ -46,7 +46,8 @@ using itype = gko::int32; #if defined(GKO_BENCHMARK_USE_DOUBLE_PRECISION) || \ defined(GKO_BENCHMARK_USE_SINGLE_PRECISION) || \ defined(GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) || \ - defined(GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) + defined(GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) || \ + defined(GKO_BENCHMARK_USE_HALF_PRECISION) // separate ifdefs to catch duplicate definitions #ifdef GKO_BENCHMARK_USE_DOUBLE_PRECISION using etype = double; @@ -60,6 +61,10 @@ using etype = std::complex; #ifdef GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION using etype = std::complex; #endif +#ifdef GKO_BENCHMARK_USE_HALF_PRECISION +#include +using etype = gko::half; +#endif #else // default to double precision using etype = double; #endif @@ -67,4 +72,33 @@ using etype = double; using rc_etype = gko::remove_complex; +namespace detail { + + +// singly linked list of all our supported precisions +template +struct next_precision_impl {}; + +template <> +struct next_precision_impl { + using type = double; +}; + +template <> +struct next_precision_impl { + using type = float; +}; + + +template +struct next_precision_impl> { + using type = std::complex::type>; +}; + + +} // namespace detail + +template +using next_precision = typename detail::next_precision_impl::type; + #endif // GKO_BENCHMARK_UTILS_TYPES_HPP_ diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 2dd068abb50..97f6a2dd602 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -205,6 +205,8 @@ if(TARGET hwloc) ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") endif() +ginkgo_print_variable(${minimal_log} 
"GINKGO_ENABLE_HALF") +ginkgo_print_variable(${detailed_log} "GINKGO_ENABLE_HALF") _minimal( " diff --git a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc index faf0ad15146..6046ef07b2b 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc +++ b/common/cuda_hip/base/device_matrix_data_kernels.hpp.inc @@ -39,9 +39,13 @@ void remove_zeros(std::shared_ptr exec, auto value_ptr = as_device_type(values.get_const_data()); auto size = values.get_num_elems(); // count nonzeros - auto nnz = thrust::count_if( - thrust_policy(exec), value_ptr, value_ptr + size, - [] __device__(device_value_type value) { return is_nonzero(value); }); + // __half != is only device, can not call __device__ from a __host__ + // __device__ (is_nonzero) + auto nnz = + thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size, + [] __device__(device_value_type value) { + return value != zero(value); + }); if (nnz < size) { using tuple_type = thrust::tuple; @@ -57,7 +61,8 @@ void remove_zeros(std::shared_ptr exec, as_device_type(new_values.get_data()))); thrust::copy_if(thrust_policy(exec), it, it + size, out_it, [] __device__(tuple_type entry) { - return is_nonzero(thrust::get<2>(entry)); + return thrust::get<2>(entry) != + zero(thrust::get<2>(entry)); }); // swap out storage values = std::move(new_values); diff --git a/common/cuda_hip/base/math.hpp.inc b/common/cuda_hip/base/math.hpp.inc index 583dd01ef93..bea153dbeda 100644 --- a/common/cuda_hip/base/math.hpp.inc +++ b/common/cuda_hip/base/math.hpp.inc @@ -39,23 +39,36 @@ struct device_numeric_limits { static constexpr auto min = std::numeric_limits::min(); }; - -namespace detail { +template <> +struct device_numeric_limits<__half> { + static constexpr auto inf = std::numeric_limits::infinity(); + static constexpr auto max = std::numeric_limits::max(); + static constexpr auto min = std::numeric_limits::min(); +}; -template -struct remove_complex_impl> { - 
using type = T; +template <> +struct device_numeric_limits<__nv_bfloat16> { + static constexpr auto inf = std::numeric_limits::infinity(); + static constexpr auto max = std::numeric_limits::max(); + static constexpr auto min = std::numeric_limits::min(); }; -template -struct is_complex_impl> - : public std::integral_constant {}; +template <> +struct device_numeric_limits { + static constexpr auto inf = std::numeric_limits::infinity(); + static constexpr auto max = std::numeric_limits::max(); + static constexpr auto min = std::numeric_limits::min(); +}; + +namespace detail { template -struct is_complex_or_scalar_impl> : std::is_scalar {}; +struct remove_complex_impl> { + using type = T; +}; template diff --git a/common/cuda_hip/components/atomic.hpp.inc b/common/cuda_hip/components/atomic.hpp.inc index 24bce48a720..5e6e90976ac 100644 --- a/common/cuda_hip/components/atomic.hpp.inc +++ b/common/cuda_hip/components/atomic.hpp.inc @@ -110,15 +110,65 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val) } \ }; + +#define GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(CONVERTER_TYPE) \ + template \ + struct atomic_helper< \ + ValueType, \ + std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> { \ + __forceinline__ __device__ static ValueType atomic_add( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + assert(false); \ + using c_type = CONVERTER_TYPE; \ + return atomic_wrapper( \ + addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ + old = *c_addr; \ + *c_addr = reinterpret( \ + val + reinterpret(assumed)); \ + }); \ + } \ + __forceinline__ __device__ static ValueType atomic_max( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + assert(false); \ + using c_type = CONVERTER_TYPE; \ + return atomic_wrapper( \ + addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ + if (reinterpret(assumed) < val) { \ + old = *c_addr; \ + *c_addr = reinterpret(assumed); \ + } \ + }); \ + } \ + \ + private: \ + template \ + 
__forceinline__ __device__ static ValueType atomic_wrapper( \ + ValueType* __restrict__ addr, Callable set_old) \ + { \ + CONVERTER_TYPE* address_as_converter = \ + reinterpret_cast(addr); \ + CONVERTER_TYPE old = *address_as_converter; \ + CONVERTER_TYPE assumed = old; \ + set_old(old, assumed, address_as_converter); \ + return reinterpret(old); \ + } \ + }; + // Support 64-bit ATOMIC_ADD and ATOMIC_MAX GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD and ATOMIC_MAX GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); -#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) +#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) && \ + (__CUDA_ARCH__ >= 700) && !(defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC) // CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS +// required the CC>=70 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); +#else +GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(unsigned short int) #endif // !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE diff --git a/common/cuda_hip/components/reduction.hpp.inc b/common/cuda_hip/components/reduction.hpp.inc index 39b3d3ffb37..9c3d3e4e014 100644 --- a/common/cuda_hip/components/reduction.hpp.inc +++ b/common/cuda_hip/components/reduction.hpp.inc @@ -75,7 +75,7 @@ __device__ __forceinline__ int choose_pivot(const Group& group, bool is_pivoted) { using real = remove_complex; - real lmag = is_pivoted ? -one() : abs(local_data); + real lmag = real(is_pivoted ? 
-one() : abs(local_data)); const auto pivot = reduce(group, group.thread_rank(), [&](int lidx, int ridx) { const auto rmag = group.shfl(lmag, ridx); diff --git a/common/cuda_hip/components/volatile.hpp.inc b/common/cuda_hip/components/volatile.hpp.inc index 402f73f088e..75b586a8508 100644 --- a/common/cuda_hip/components/volatile.hpp.inc +++ b/common/cuda_hip/components/volatile.hpp.inc @@ -40,9 +40,13 @@ __device__ __forceinline__ } template -__device__ __forceinline__ std::enable_if_t< - std::is_floating_point::value, thrust::complex> -load(const thrust::complex* values, IndexType index) +__device__ __forceinline__ + std::enable_if_t::value || + std::is_same::value || + std::is_same::value || + std::is_same::value, + thrust::complex> + load(const thrust::complex* values, IndexType index) { auto real = reinterpret_cast(values); auto imag = real + 1; diff --git a/common/cuda_hip/components/warp_blas.hpp.inc b/common/cuda_hip/components/warp_blas.hpp.inc index 8869dae3e3b..40e1af7a500 100644 --- a/common/cuda_hip/components/warp_blas.hpp.inc +++ b/common/cuda_hip/components/warp_blas.hpp.inc @@ -69,7 +69,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform( if (group.thread_rank() == key_row) { key_col_elem = one() / key_col_elem; } else { - key_col_elem = -row[key_col] / key_col_elem; + key_col_elem = zero() - row[key_col] / key_col_elem; } #pragma unroll for (int32 i = 0; i < max_problem_size; ++i) { @@ -115,7 +115,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform_with_rhs( key_col_elem = one() / key_col_elem; rhs[0] = key_rhs_elem * key_col_elem; } else { - key_col_elem = -row[key_col] / key_col_elem; + key_col_elem = zero() - row[key_col] / key_col_elem; rhs[0] += key_rhs_elem * key_col_elem; } #pragma unroll @@ -435,5 +435,5 @@ __device__ __forceinline__ remove_complex compute_infinity_norm( } } return reduce(group, sum, - [](result_type x, result_type y) { return max(x, y); }); + [](result_type x, result_type y) { return 
gko::max(x, y); }); } diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.hpp.inc index 0fa7afab859..47c0aab04e4 100644 --- a/common/cuda_hip/distributed/matrix_kernels.hpp.inc +++ b/common/cuda_hip/distributed/matrix_kernels.hpp.inc @@ -138,11 +138,11 @@ void build_local_nonlocal( col_range_starting_indices[range_id]; }; - using input_type = input_type; + using input_type = input_type, GlobalIndexType>; auto input_it = thrust::make_zip_iterator(thrust::make_tuple( input.get_const_row_idxs(), input.get_const_col_idxs(), - input.get_const_values(), row_range_ids.get_const_data(), - col_range_ids.get_const_data())); + as_device_type(input.get_const_values()), + row_range_ids.get_const_data(), col_range_ids.get_const_data())); // copy and transform local entries into arrays local_row_idxs.resize_and_reset(num_local_elements); @@ -158,9 +158,9 @@ void build_local_nonlocal( thrust::copy_if( thrust_policy(exec), local_it, local_it + input.get_num_elems(), range_ids_it, - thrust::make_zip_iterator(thrust::make_tuple(local_row_idxs.get_data(), - local_col_idxs.get_data(), - local_values.get_data())), + thrust::make_zip_iterator(thrust::make_tuple( + local_row_idxs.get_data(), local_col_idxs.get_data(), + as_device_type(local_values.get_data()))), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; @@ -189,7 +189,8 @@ void build_local_nonlocal( range_ids_it, thrust::make_zip_iterator(thrust::make_tuple( non_local_row_idxs.get_data(), non_local_global_col_idxs.get_data(), - non_local_values.get_data(), non_local_col_part_ids.get_data(), + as_device_type(non_local_values.get_data()), + non_local_col_part_ids.get_data(), non_local_col_range_ids.get_data())), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc 
b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc index 2f73d731a69..8adc9329826 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp.inc @@ -269,7 +269,7 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select( __shared__ ValueType sh_local[basecase_size]; for (int i = 0; i < basecase_local_size; ++i) { auto idx = threadIdx.x + i * basecase_block_size; - local[i] = idx < size ? input[idx] : sentinel; + local[i] = idx < size ? input[idx] : static_cast(sentinel); } bitonic_sort(local, sh_local); if (threadIdx.x == rank / basecase_local_size) { diff --git a/common/cuda_hip/matrix/csr_kernels.hpp.inc b/common/cuda_hip/matrix/csr_kernels.hpp.inc index 3f02337747e..a6f6269a0b4 100644 --- a/common/cuda_hip/matrix/csr_kernels.hpp.inc +++ b/common/cuda_hip/matrix/csr_kernels.hpp.inc @@ -222,7 +222,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv( { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; - const arithmetic_type scale_factor = alpha[0]; + const arithmetic_type scale_factor = static_cast(alpha[0]); spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c, [&scale_factor](const arithmetic_type& x) { return static_cast(scale_factor * x); @@ -430,7 +430,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_reduce( const IndexType* __restrict__ last_row, const MatrixValueType* __restrict__ alpha, acc::range c) { - const arithmetic_type alpha_val = alpha[0]; + const arithmetic_type alpha_val = static_cast(alpha[0]); merge_path_reduce( nwarps, last_val, last_row, c, [&alpha_val](const arithmetic_type& x) { return alpha_val * x; }); diff --git a/common/cuda_hip/matrix/ell_kernels.hpp.inc b/common/cuda_hip/matrix/ell_kernels.hpp.inc index 6c81fb4964c..4c0a46f2193 100644 --- a/common/cuda_hip/matrix/ell_kernels.hpp.inc +++ 
b/common/cuda_hip/matrix/ell_kernels.hpp.inc @@ -43,13 +43,14 @@ __device__ void spmv_kernel( acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, Closure op) { + using arithmetic_type = typename a_accessor::arithmetic_type; const auto tidx = thread::get_thread_id_flat(); const decltype(tidx) column_id = blockIdx.y; if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - auto temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; @@ -69,13 +70,13 @@ __device__ void spmv_kernel( const auto worker_id = tidx / num_rows; const auto step_size = num_worker_per_row * num_thread_per_worker; __shared__ uninitialized_array< - OutputValueType, default_block_size / num_thread_per_worker> + arithmetic_type, default_block_size / num_thread_per_worker> storage; if (idx_in_worker == 0) { - storage[threadIdx.x] = 0; + storage[threadIdx.x] = gko::zero(); } __syncthreads(); - auto temp = zero(); + auto temp = zero(); for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; idx < num_stored_elements_per_row; idx += step_size) { @@ -114,7 +115,9 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [](const OutputValueType& x, const OutputValueType& y) { return x; }); + [](const auto& x, const OutputValueType& y) { + return static_cast(x); + }); } @@ -128,7 +131,8 @@ __global__ __launch_bounds__(default_block_size) void spmv( const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, const size_type c_stride) { - const OutputValueType alpha_val = alpha(0); + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto alpha_val = alpha(0); const OutputValueType beta_val = 
beta[0]; if (atomic) { // Because the atomic operation changes the values of c during @@ -139,16 +143,16 @@ __global__ __launch_bounds__(default_block_size) void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val](const OutputValueType& x, const OutputValueType& y) { - return alpha_val * x; + [&alpha_val](const auto& x, const OutputValueType& y) { + return static_cast(alpha_val * x); }); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val, &beta_val](const OutputValueType& x, - const OutputValueType& y) { - return alpha_val * x + beta_val * y; + [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { + return static_cast( + alpha_val * x + static_cast(beta_val * y)); }); } } diff --git a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc b/common/cuda_hip/preconditioner/isai_kernels.hpp.inc index ce46925ef58..5ac21caf318 100644 --- a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc +++ b/common/cuda_hip/preconditioner/isai_kernels.hpp.inc @@ -290,8 +290,9 @@ __global__ __launch_bounds__(default_block_size) void generate_general_inverse( ValueType sol = subwarp.shfl(rhs, perm); if (spd) { - auto diag = subwarp.shfl(sol, num_elems - 1); - sol /= sqrt(diag); + ValueType diag = subwarp.shfl(sol, num_elems - 1); + // TODO: check why HIP bfloat16 return float + sol /= static_cast(sqrt(diag)); } return sol; diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc index d5b9fb85551..808dffabfd4 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernel.hpp.inc @@ -72,7 +72,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( } } - return succeeded && block_cond >= 1.0 && + return succeeded && block_cond >= remove_complex{1.0} && 
block_cond * static_cast>( float_traits>::eps) < remove_complex{1e-3}; diff --git a/common/cuda_hip/solver/multigrid_kernels.hpp.inc b/common/cuda_hip/solver/multigrid_kernels.hpp.inc index 472187314a6..12ef1d6efb6 100644 --- a/common/cuda_hip/solver/multigrid_kernels.hpp.inc +++ b/common/cuda_hip/solver/multigrid_kernels.hpp.inc @@ -191,8 +191,8 @@ void kcycle_check_stop(std::shared_ptr exec, kernel::kcycle_check_stop_kernel<<get_stream()>>>( nrhs, as_device_type(old_norm->get_const_values()), - as_device_type(new_norm->get_const_values()), rel_tol, - as_device_type(dis_stop.get_data())); + as_device_type(new_norm->get_const_values()), + as_device_type(rel_tol), as_device_type(dis_stop.get_data())); } is_stop = exec->copy_val_to_host(dis_stop.get_const_data()); } diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index 457d3d368e7..47be03f6951 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" - namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { @@ -60,9 +59,21 @@ template void fill_seq_array(std::shared_ptr exec, ValueType* array, size_type n) { + // __half only has long long not int64_t run_kernel( - exec, [] GKO_KERNEL(auto idx, auto array) { array[idx] = idx; }, n, - array); + exec, + [] GKO_KERNEL(auto idx, auto array) { + // hip bfloat16 does not provide implicit conversion + array[idx] = static_cast>, + __nv_bfloat16>::value, + float, + typename std::conditional< + std::is_same>, + hip_bfloat16>::value, + hip_bfloat16, long long>::type>::type>(idx); + }, + n, array); } GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); diff --git a/common/unified/components/precision_conversion_kernels.cpp b/common/unified/components/precision_conversion_kernels.cpp index 47ddf7bfc92..df1cd9fa062 100644 --- a/common/unified/components/precision_conversion_kernels.cpp +++ b/common/unified/components/precision_conversion_kernels.cpp @@ -36,6 +36,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" +#include +#include + namespace gko { namespace kernels { namespace GKO_DEVICE_NAMESPACE { @@ -48,7 +51,14 @@ void convert_precision(std::shared_ptr exec, { run_kernel( exec, - [] GKO_KERNEL(auto idx, auto in, auto out) { out[idx] = in[idx]; }, + [] GKO_KERNEL(auto idx, auto in, auto out) { + using target_type = device_type; + using arithmetic_type = + highest_precision>; + // use float as the bridge between bfloat16 and half on device + out[idx] = + static_cast(static_cast(in[idx])); + }, size, in, out); } diff --git a/common/unified/matrix/dense_kernels.template.cpp b/common/unified/matrix/dense_kernels.template.cpp index b6ed5fb37e0..81d7543d79c 100644 --- a/common/unified/matrix/dense_kernels.template.cpp +++ b/common/unified/matrix/dense_kernels.template.cpp @@ -62,7 +62,11 @@ void copy(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto input, auto output) { - output(row, col) = input(row, col); + using type = device_type; + using arithmetic_type = + highest_precision>; + output(row, col) = static_cast( + static_cast(input(row, col))); }, input->get_size(), input, output); } @@ -404,7 +408,11 @@ void row_gather(std::shared_ptr exec, run_kernel( exec, [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { - gathered(row, col) = orig(rows[row], col); + using output_type = device_type; + using arithmetic_type = + highest_precision>; + gathered(row, col) = static_cast( + static_cast(orig(rows[row], col))); }, dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, row_collection); @@ -424,10 +432,10 @@ void advanced_row_gather(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto col, auto alpha, auto orig, auto rows, auto beta, auto gathered) { using type = device_type>; - gathered(row, col) = + gathered(row, col) = static_cast>( static_cast(alpha[0] * orig(rows[row], col)) + static_cast(beta[0]) * - static_cast(gathered(row, col)); + 
static_cast(gathered(row, col))); }, dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index a61b32dacbd..82b8853e4b3 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -214,7 +214,7 @@ void find_strongest_neighbor( continue; } auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + weight_vals[idx] / gko::max(abs(diag[row]), abs(diag[col])); if (agg[col] == -1 && device_std::tie(weight, col) > device_std::tie(max_weight_unagg, strongest_unagg)) { @@ -266,6 +266,7 @@ void assign_to_exist_agg(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals, auto diag, auto agg_const_val, auto agg_val) { + using value_type = device_type; if (agg_val[row] != -1) { return; } @@ -277,8 +278,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (col == row) { continue; } - auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + auto weight = weight_vals[idx] / + gko::max(abs(diag[row]), abs(diag[col])); if (agg_const_val[col] != -1 && device_std::tie(weight, col) > device_std::tie(max_weight_agg, strongest_agg)) { @@ -304,6 +305,7 @@ void assign_to_exist_agg(std::shared_ptr exec, exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals, auto diag, auto agg_val) { + using value_type = device_type; if (agg_val[row] != -1) { return; } @@ -315,8 +317,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (col == row) { continue; } - auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + auto weight = weight_vals[idx] / + gko::max(abs(diag[row]), abs(diag[col])); if (agg_val[col] != -1 && device_std::tie(weight, col) > device_std::tie(max_weight_agg, strongest_agg)) { diff --git a/common/unified/solver/common_gmres_kernels.cpp 
b/common/unified/solver/common_gmres_kernels.cpp index 94646cc477f..7c00df081f9 100644 --- a/common/unified/solver/common_gmres_kernels.cpp +++ b/common/unified/solver/common_gmres_kernels.cpp @@ -117,7 +117,7 @@ void hessenberg_qr(std::shared_ptr exec, const auto gc = givens_cos(j, rhs); const auto gs = givens_sin(j, rhs); const auto out1 = gc * hess_this + gs * hess_next; - const auto out2 = -conj(gs) * hess_this + conj(gc) * hess_next; + const auto out2 = conj(gc) * hess_next - conj(gs) * hess_this; hessenberg_iter(j, rhs) = out1; hessenberg_iter(j + 1, rhs) = hess_this = out2; hess_next = hessenberg_iter(j + 2, rhs); @@ -143,8 +143,8 @@ void hessenberg_qr(std::shared_ptr exec, hessenberg_iter(iter, rhs) = gc * hess_this + gs * hess_next; hessenberg_iter(iter + 1, rhs) = zero(); // apply new Givens rotation to RHS of least-squares problem - const auto rnc_new = - -conj(gs) * residual_norm_collection(iter, rhs); + const auto rnc_new = zero() - + conj(gs) * residual_norm_collection(iter, rhs); residual_norm_collection(iter + 1, rhs) = rnc_new; residual_norm_collection(iter, rhs) = gc * residual_norm_collection(iter, rhs); diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index 23591cd1ffe..5dc119c71e5 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -290,6 +290,42 @@ void MultiVector::move_to( } +#if GINKGO_ENABLE_HALF +template +void MultiVector::convert_to( + MultiVector>>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void MultiVector::move_to( + MultiVector>>* result) +{ + this->convert_to(result); +} +#endif + + +template +void MultiVector::convert_to( + MultiVector>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void MultiVector::move_to( + MultiVector>* result) +{ + this->convert_to(result); +} + + #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class 
MultiVector<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); diff --git a/core/base/device_matrix_data_kernels.hpp b/core/base/device_matrix_data_kernels.hpp index 2ab06dec3ec..e942362934d 100644 --- a/core/base/device_matrix_data_kernels.hpp +++ b/core/base/device_matrix_data_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 8f6ee2b0cb9..fb01cfe7fe0 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include @@ -54,347 +55,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#else +class __half; #endif // __CUDA_ARCH__ namespace gko { -template -class truncated; - - -namespace detail { - - -template -struct uint_of_impl {}; - -template -struct uint_of_impl> { - using type = uint16; -}; - -template -struct uint_of_impl> { - using type = uint32; -}; - -template -struct uint_of_impl> { - using type = uint64; -}; - -template -using uint_of = typename uint_of_impl::type; - - -template -struct basic_float_traits {}; - -template <> -struct basic_float_traits { - using type = float16; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 10; - static constexpr int exponent_bits = 5; - static constexpr bool rounds_to_nearest = true; -}; - -template <> -struct basic_float_traits { - using type = float32; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 23; - static constexpr int exponent_bits = 8; - static constexpr bool rounds_to_nearest = true; -}; - -template <> -struct basic_float_traits { - using type = float64; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 52; - static constexpr int exponent_bits = 11; - static constexpr bool 
rounds_to_nearest = true; -}; - -template -struct basic_float_traits> { - using type = truncated; - static constexpr int sign_bits = ComponentId == 0 ? 1 : 0; - static constexpr int exponent_bits = - ComponentId == 0 ? basic_float_traits::exponent_bits : 0; - static constexpr int significand_bits = - ComponentId == 0 ? sizeof(type) * byte_size - exponent_bits - 1 - : sizeof(type) * byte_size; - static constexpr bool rounds_to_nearest = false; -}; - - -template -constexpr UintType create_ones(int n) -{ - return (n == sizeof(UintType) * byte_size ? static_cast(0) - : static_cast(1) << n) - - static_cast(1); -} - -template -struct float_traits { - using type = typename basic_float_traits::type; - using bits_type = uint_of; - static constexpr int sign_bits = basic_float_traits::sign_bits; - static constexpr int significand_bits = - basic_float_traits::significand_bits; - static constexpr int exponent_bits = basic_float_traits::exponent_bits; - static constexpr bits_type significand_mask = - create_ones(significand_bits); - static constexpr bits_type exponent_mask = - create_ones(significand_bits + exponent_bits) - - significand_mask; - static constexpr bits_type bias_mask = - create_ones(significand_bits + exponent_bits - 1) - - significand_mask; - static constexpr bits_type sign_mask = - create_ones(sign_bits + significand_bits + exponent_bits) - - exponent_mask - significand_mask; - static constexpr bool rounds_to_nearest = - basic_float_traits::rounds_to_nearest; - - static constexpr auto eps = - 1.0 / (1ll << (significand_bits + rounds_to_nearest)); - - static constexpr bool is_inf(bits_type data) - { - return (data & exponent_mask) == exponent_mask && - (data & significand_mask) == bits_type{}; - } - - static constexpr bool is_nan(bits_type data) - { - return (data & exponent_mask) == exponent_mask && - (data & significand_mask) != bits_type{}; - } - - static constexpr bool is_denom(bits_type data) - { - return (data & exponent_mask) == bits_type{}; - } -}; - - 
-template -struct precision_converter; - -// upcasting implementation details -template -struct precision_converter { - using source_traits = float_traits; - using result_traits = float_traits; - using source_bits = typename source_traits::bits_type; - using result_bits = typename result_traits::bits_type; - - static_assert(source_traits::exponent_bits <= - result_traits::exponent_bits && - source_traits::significand_bits <= - result_traits::significand_bits, - "SourceType has to have both lower range and precision or " - "higher range and precision than ResultType"); - - static constexpr int significand_offset = - result_traits::significand_bits - source_traits::significand_bits; - static constexpr int exponent_offset = significand_offset; - static constexpr int sign_offset = result_traits::exponent_bits - - source_traits::exponent_bits + - exponent_offset; - static constexpr result_bits bias_change = - result_traits::bias_mask - - (static_cast(source_traits::bias_mask) << exponent_offset); - - static constexpr result_bits shift_significand(source_bits data) noexcept - { - return static_cast(data & source_traits::significand_mask) - << significand_offset; - } - - static constexpr result_bits shift_exponent(source_bits data) noexcept - { - return update_bias( - static_cast(data & source_traits::exponent_mask) - << exponent_offset); - } - - static constexpr result_bits shift_sign(source_bits data) noexcept - { - return static_cast(data & source_traits::sign_mask) - << sign_offset; - } - -private: - static constexpr result_bits update_bias(result_bits data) noexcept - { - return data == typename result_traits::bits_type{} ? 
data - : data + bias_change; - } -}; - -// downcasting implementation details -template -struct precision_converter { - using source_traits = float_traits; - using result_traits = float_traits; - using source_bits = typename source_traits::bits_type; - using result_bits = typename result_traits::bits_type; - - static_assert(source_traits::exponent_bits >= - result_traits::exponent_bits && - source_traits::significand_bits >= - result_traits::significand_bits, - "SourceType has to have both lower range and precision or " - "higher range and precision than ResultType"); - - static constexpr int significand_offset = - source_traits::significand_bits - result_traits::significand_bits; - static constexpr int exponent_offset = significand_offset; - static constexpr int sign_offset = source_traits::exponent_bits - - result_traits::exponent_bits + - exponent_offset; - static constexpr source_bits bias_change = - (source_traits::bias_mask >> exponent_offset) - - static_cast(result_traits::bias_mask); - - static constexpr result_bits shift_significand(source_bits data) noexcept - { - return static_cast( - (data & source_traits::significand_mask) >> significand_offset); - } - - static constexpr result_bits shift_exponent(source_bits data) noexcept - { - return static_cast(update_bias( - (data & source_traits::exponent_mask) >> exponent_offset)); - } - - static constexpr result_bits shift_sign(source_bits data) noexcept - { - return static_cast((data & source_traits::sign_mask) >> - sign_offset); - } - -private: - static constexpr source_bits update_bias(source_bits data) noexcept - { - return data <= bias_change ? typename source_traits::bits_type{} - : limit_exponent(data - bias_change); - } - - static constexpr source_bits limit_exponent(source_bits data) noexcept - { - return data >= static_cast(result_traits::exponent_mask) - ? 
static_cast(result_traits::exponent_mask) - : data; - } -}; - - -} // namespace detail - - -/** - * A class providing basic support for half precision floating point types. - * - * For now the only features are reduced storage compared to single precision - * and conversions from and to single precision floating point type. - */ -class half { -public: - half() noexcept = default; - - GKO_ATTRIBUTES half(float32 val) noexcept - { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const auto tmp = __float2half_rn(val); - data_ = reinterpret_cast(tmp); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - data_ = float2half(reinterpret_cast(val)); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - } - - GKO_ATTRIBUTES half(float64 val) noexcept : half(static_cast(val)) - {} - - GKO_ATTRIBUTES operator float32() const noexcept - { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __half2float(reinterpret_cast(data_)); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const auto bits = half2float(data_); - return reinterpret_cast(bits); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - } - - GKO_ATTRIBUTES operator float64() const noexcept - { - return static_cast(static_cast(*this)); - } - - GKO_ATTRIBUTES half operator-() const noexcept - { - auto res = *this; - // flip sign bit - res.data_ ^= f16_traits::sign_mask; - return res; - } - -private: - using f16_traits = detail::float_traits; - using f32_traits = detail::float_traits; - - static uint16 float2half(uint32 data_) noexcept - { - using conv = detail::precision_converter; - if (f32_traits::is_inf(data_)) { - return conv::shift_sign(data_) | f16_traits::exponent_mask; - } else if (f32_traits::is_nan(data_)) { - return conv::shift_sign(data_) | f16_traits::exponent_mask | - f16_traits::significand_mask; - } else { - const auto exp = conv::shift_exponent(data_); - if (f16_traits::is_inf(exp)) { - return 
conv::shift_sign(data_) | exp; - } else if (f16_traits::is_denom(exp)) { - // TODO: handle denormals - return conv::shift_sign(data_); - } else { - return conv::shift_sign(data_) | exp | - conv::shift_significand(data_); - } - } - } - - static uint32 half2float(uint16 data_) noexcept - { - using conv = detail::precision_converter; - if (f16_traits::is_inf(data_)) { - return conv::shift_sign(data_) | f32_traits::exponent_mask; - } else if (f16_traits::is_nan(data_)) { - return conv::shift_sign(data_) | f32_traits::exponent_mask | - f32_traits::significand_mask; - } else if (f16_traits::is_denom(data_)) { - // TODO: handle denormals - return conv::shift_sign(data_); - } else { - return conv::shift_sign(data_) | conv::shift_exponent(data_) | - conv::shift_significand(data_); - } - } - - uint16 data_; -}; - - /** * This template implements the truncated (or split) storage of a floating point * type. @@ -488,38 +156,6 @@ class truncated { namespace std { -template <> -class complex { -public: - using value_type = gko::half; - - complex(const value_type& real = 0.f, const value_type& imag = 0.f) - : real_(real), imag_(imag) - {} - - template - explicit complex(const complex& other) - : complex(static_cast(other.real()), - static_cast(other.imag())) - {} - - value_type real() const noexcept { return real_; } - - value_type imag() const noexcept { return imag_; } - - - operator std::complex() const noexcept - { - return std::complex(static_cast(real_), - static_cast(imag_)); - } - -private: - value_type real_; - value_type imag_; -}; - - template class complex> { public: @@ -551,31 +187,6 @@ class complex> { }; -template <> -struct is_scalar : std::true_type {}; - - -template <> -struct numeric_limits { - static constexpr bool is_specialized{true}; - static constexpr bool is_signed{true}; - static constexpr bool is_integer{false}; - static constexpr bool is_exact{false}; - static constexpr bool is_bounded{true}; - static constexpr bool is_modulo{false}; - static constexpr 
int digits{ - gko::detail::float_traits::significand_bits + 1}; - // 3/10 is approx. log_10(2) - static constexpr int digits10{digits * 3 / 10}; - - // Note: gko::half can't return gko::half here because it does not have - // a constexpr constructor. - static constexpr float epsilon() - { - return gko::detail::float_traits::eps; - } -}; - } // namespace std diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index b5c1e37569b..f23a3352ed0 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -35,45 +35,224 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #ifdef GINKGO_MIXED_PRECISION + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ + GKO_ADAPT_HF(_macro(float, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, float, bfloat16, __VA_ARGS__)); \ template _macro(float, float, float, __VA_ARGS__); \ template _macro(float, float, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(float, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, double, bfloat16, __VA_ARGS__)); \ template _macro(float, double, float, __VA_ARGS__); \ template _macro(float, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(double, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, half, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, float, bfloat16, __VA_ARGS__)); \ template _macro(double, float, float, __VA_ARGS__); \ template _macro(double, float, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(double, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, double, bfloat16, __VA_ARGS__)); \ template _macro(double, double, float, __VA_ARGS__); \ template _macro(double, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(half, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, half, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, double, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(bfloat16, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, half, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, double, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + #else + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ template _macro(float, float, float, __VA_ARGS__) @@ -88,6 +267,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ + GKO_ADAPT_HF(_macro(half, half, half, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(_macro, ...) \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, bfloat16, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(_macro, ...) \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + + #endif @@ -95,7 +289,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \ - GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__) + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(_macro, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ @@ -104,20 +302,60 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ - template _macro(float, float, __VA_ARGS__); \ - template _macro(float, double, __VA_ARGS__); \ - template _macro(double, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, bfloat16, __VA_ARGS__)); \ + template _macro(float, float, __VA_ARGS__); \ + template _macro(float, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, bfloat16, __VA_ARGS__)); \ + template _macro(double, float, __VA_ARGS__); \ + template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #else -#define 
GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ - template _macro(float, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ + GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16, __VA_ARGS__)); \ + template _macro(float, float, __VA_ARGS__); \ + template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #endif diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index de4f6ec1e86..a0f612ca2fd 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include @@ -778,19 +779,35 @@ static constexpr uint64 binary_format_magic() { constexpr auto is_int = std::is_same::value; constexpr auto is_long = std::is_same::value; + constexpr auto is_half = std::is_same::value; + constexpr auto is_bfloat16 = std::is_same::value; constexpr auto is_double = std::is_same::value; constexpr auto is_float = std::is_same::value; constexpr auto is_complex_double = std::is_same>::value; constexpr auto is_complex_float = std::is_same>::value; + constexpr auto is_complex_half = + std::is_same>::value; + constexpr auto is_complex_bfloat16 = + std::is_same>::value; static_assert(is_int || is_long, "invalid storage index type"); - static_assert( - is_double || is_float || is_complex_double || is_complex_float, - "invalid storage value type"); + static_assert(is_bfloat16 || is_complex_bfloat16 || is_half || + is_complex_half || is_double || is_float || + is_complex_double || is_complex_float, + "invalid storage value type"); constexpr 
auto index_bit = is_int ? 'I' : 'L'; constexpr auto value_bit = - is_double ? 'D' : (is_float ? 'S' : (is_complex_double ? 'Z' : 'C')); + is_double + ? 'D' + : (is_float + ? 'S' + : (is_complex_double + ? 'Z' + : (is_complex_float + ? 'C' + : (is_half ? 'H' + : (is_bfloat16 ? 'B' : 'X'))))); constexpr uint64 shift = 256; constexpr uint64 type_bits = index_bit * shift + value_bit; return 'G' + @@ -900,12 +917,16 @@ matrix_data read_binary_raw(std::istream& is) } DECLARE_OVERLOAD(double, int32) DECLARE_OVERLOAD(float, int32) + DECLARE_OVERLOAD(half, int32) DECLARE_OVERLOAD(std::complex, int32) DECLARE_OVERLOAD(std::complex, int32) + DECLARE_OVERLOAD(std::complex, int32) DECLARE_OVERLOAD(double, int64) DECLARE_OVERLOAD(float, int64) + DECLARE_OVERLOAD(half, int64) DECLARE_OVERLOAD(std::complex, int64) DECLARE_OVERLOAD(std::complex, int64) + DECLARE_OVERLOAD(std::complex, int64) #undef DECLARE_OVERLOAD else { diff --git a/core/base/utils.hpp b/core/base/utils.hpp index 6c5bfb783dd..4250d35e8ef 100644 --- a/core/base/utils.hpp +++ b/core/base/utils.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/components/absolute_array_kernels.hpp b/core/components/absolute_array_kernels.hpp index 94ec12e98a5..affa5f27eb0 100644 --- a/core/components/absolute_array_kernels.hpp +++ b/core/components/absolute_array_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/components/fill_array_kernels.hpp b/core/components/fill_array_kernels.hpp index 607e99d036e..3da114961c7 100644 --- a/core/components/fill_array_kernels.hpp +++ b/core/components/fill_array_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include diff --git a/core/components/format_conversion_kernels.hpp b/core/components/format_conversion_kernels.hpp index 76d5ad6000b..e46814b6351 100644 --- a/core/components/format_conversion_kernels.hpp +++ b/core/components/format_conversion_kernels.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include diff --git a/core/components/precision_conversion_kernels.hpp b/core/components/precision_conversion_kernels.hpp index 13da41d72d8..1abf78d1c19 100644 --- a/core/components/precision_conversion_kernels.hpp +++ b/core/components/precision_conversion_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/components/prefix_sum_kernels.hpp b/core/components/prefix_sum_kernels.hpp index 09a34f5931b..277c13ff7ba 100644 --- a/core/components/prefix_sum_kernels.hpp +++ b/core/components/prefix_sum_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/core/components/reduce_array_kernels.hpp b/core/components/reduce_array_kernels.hpp index 5ff591e71df..845a77f5409 100644 --- a/core/components/reduce_array_kernels.hpp +++ b/core/components/reduce_array_kernels.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index f8de8dbaef0..d0d49583533 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -139,6 +139,85 @@ void Matrix::move_to( } +#if GINKGO_ENABLE_HALF +template +void Matrix::convert_to( + Matrix>, local_index_type, + global_index_type>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->copy_from(this->local_mtx_.get()); + result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); + result->gather_idxs_ = this->gather_idxs_; + result->send_offsets_ = this->send_offsets_; + result->recv_offsets_ = this->recv_offsets_; + result->recv_sizes_ = this->recv_sizes_; + result->send_sizes_ = this->send_sizes_; + result->non_local_to_global_ = this->non_local_to_global_; + result->set_size(this->get_size()); +} + + +template +void Matrix::move_to( + Matrix>, local_index_type, + global_index_type>* result) +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->move_from(this->local_mtx_.get()); + result->non_local_mtx_->move_from(this->non_local_mtx_.get()); + result->gather_idxs_ = std::move(this->gather_idxs_); + result->send_offsets_ = std::move(this->send_offsets_); + result->recv_offsets_ = std::move(this->recv_offsets_); + result->recv_sizes_ = std::move(this->recv_sizes_); + result->send_sizes_ = std::move(this->send_sizes_); + result->non_local_to_global_ = std::move(this->non_local_to_global_); + result->set_size(this->get_size()); + this->set_size({}); +} + + +template +void Matrix::convert_to( + Matrix, local_index_type, global_index_type>* + result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->copy_from(this->local_mtx_.get()); + result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); + result->gather_idxs_ = 
this->gather_idxs_; + result->send_offsets_ = this->send_offsets_; + result->recv_offsets_ = this->recv_offsets_; + result->recv_sizes_ = this->recv_sizes_; + result->send_sizes_ = this->send_sizes_; + result->non_local_to_global_ = this->non_local_to_global_; + result->set_size(this->get_size()); +} + + +template +void Matrix::move_to( + Matrix, local_index_type, global_index_type>* + result) +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->move_from(this->local_mtx_.get()); + result->non_local_mtx_->move_from(this->non_local_mtx_.get()); + result->gather_idxs_ = std::move(this->gather_idxs_); + result->send_offsets_ = std::move(this->send_offsets_); + result->recv_offsets_ = std::move(this->recv_offsets_); + result->recv_sizes_ = std::move(this->recv_sizes_); + result->send_sizes_ = std::move(this->send_sizes_); + result->non_local_to_global_ = std::move(this->non_local_to_global_); + result->set_size(this->get_size()); + this->set_size({}); +} +#endif + template void Matrix::read_distributed( const device_matrix_data& data, diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index bda7c30b88b..8a341cad9b0 100644 --- a/core/distributed/matrix_kernels.hpp +++ b/core/distributed/matrix_kernels.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 001cf75b76d..5b1549b393a 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -290,6 +290,44 @@ void Vector::move_to(Vector>* result) } +#if GINKGO_ENABLE_HALF +template +void Vector::convert_to( + Vector>>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->set_size(this->get_size()); + this->get_local_vector()->convert_to(&result->local_); +} + + +template +void Vector::move_to( + Vector>>* result) +{ + this->convert_to(result); +} + + +template +void Vector::convert_to( + Vector>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->set_size(this->get_size()); + this->get_local_vector()->convert_to(&result->local_); +} + + +template +void Vector::move_to(Vector>* result) +{ + this->convert_to(result); +} +#endif + template std::unique_ptr::absolute_type> Vector::compute_absolute() const diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index 009bed918a3..4ac2cfc23a3 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/factorization/factorization_kernels.hpp b/core/factorization/factorization_kernels.hpp index 85d5fc5a3ae..9a240ad455b 100644 --- a/core/factorization/factorization_kernels.hpp +++ b/core/factorization/factorization_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp index 12209b0d1c5..d9337c5c5a0 100644 --- a/core/factorization/ilu_kernels.hpp +++ b/core/factorization/ilu_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp index d3e7aea8f08..1e41b9e9b85 100644 --- a/core/factorization/lu_kernels.hpp +++ b/core/factorization/lu_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/factorization/par_ic_kernels.hpp b/core/factorization/par_ic_kernels.hpp index 8a461501fc7..f5356fd334b 100644 --- a/core/factorization/par_ic_kernels.hpp +++ b/core/factorization/par_ic_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/factorization/par_ict_kernels.hpp b/core/factorization/par_ict_kernels.hpp index c6049a220f3..198d10db87a 100644 --- a/core/factorization/par_ict_kernels.hpp +++ b/core/factorization/par_ict_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/factorization/par_ilu_kernels.hpp b/core/factorization/par_ilu_kernels.hpp index 8a8bd96314f..51f43ea1e74 100644 --- a/core/factorization/par_ilu_kernels.hpp +++ b/core/factorization/par_ilu_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include diff --git a/core/factorization/par_ilut_kernels.hpp b/core/factorization/par_ilut_kernels.hpp index 98d908e5c83..b4c4747159d 100644 --- a/core/factorization/par_ilut_kernels.hpp +++ b/core/factorization/par_ilut_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/log/papi.cpp b/core/log/papi.cpp index ff1cc1de3d0..e9bb541e4b6 100644 --- a/core/log/papi.cpp +++ b/core/log/papi.cpp @@ -243,15 +243,13 @@ void Papi::on_criterion_check_completed( double residual_norm_d = 0.0; if (residual_norm != nullptr) { auto dense_r_norm = as(residual_norm); - residual_norm_d = - static_cast(std::real(dense_r_norm->at(0, 0))); + residual_norm_d = static_cast(real(dense_r_norm->at(0, 0))); } else if (residual != nullptr) { detail::vector_dispatch(residual, [&](const auto* dense_r) { auto tmp_res_norm = Vector::create( residual->get_executor(), dim<2>{1, residual->get_size()[1]}); dense_r->compute_norm2(tmp_res_norm); - residual_norm_d = - static_cast(std::real(tmp_res_norm->at(0, 0))); + residual_norm_d = static_cast(real(tmp_res_norm->at(0, 0))); }); } diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 6d28cf2f7b7..723e827aee5 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -144,6 +144,46 @@ void Coo::move_to( } +#if GINKGO_ENABLE_HALF +template +void Coo::convert_to( + Coo>, IndexType>* result) const +{ + result->values_ = this->values_; + result->row_idxs_ = this->row_idxs_; + result->col_idxs_ = this->col_idxs_; + result->set_size(this->get_size()); +} + + +template +void Coo::move_to( + Coo>, IndexType>* result) +{ + this->convert_to(result); +} + + +template +void Coo::convert_to( + Coo, IndexType>* result) const +{ + result->values_ = this->values_; + result->row_idxs_ = this->row_idxs_; + result->col_idxs_ = this->col_idxs_; + result->set_size(this->get_size()); +} + + +template +void Coo::move_to( + Coo, 
IndexType>* result) +{ + this->convert_to(result); +} +#endif + + template void Coo::convert_to( Csr* result) const diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp index 84db65e27fc..2527a6e675f 100644 --- a/core/matrix/coo_kernels.hpp +++ b/core/matrix/coo_kernels.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 9a4697c1195..f8466fa752d 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -257,6 +257,47 @@ void Csr::move_to( this->convert_to(result); } +#if GINKGO_ENABLE_HALF +template +void Csr::convert_to( + Csr>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + convert_strategy_helper(result); +} + + +template +void Csr::move_to( + Csr>, IndexType>* result) +{ + this->convert_to(result); +} + + +template +void Csr::convert_to( + Csr, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + convert_strategy_helper(result); +} + + +template +void Csr::move_to( + Csr, IndexType>* result) +{ + this->convert_to(result); +} +#endif + template void Csr::convert_to( diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 42a92ca1b84..2b7f9e2befd 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp index 733ef9214ba..a1cded747ce 100644 --- a/core/matrix/csr_lookup.hpp +++ b/core/matrix/csr_lookup.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 17dec93c234..bc37226bcd9 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -585,6 +585,55 @@ void Dense::move_to(Dense>* result) } +#if GINKGO_ENABLE_HALF +template +void Dense::convert_to( + Dense>>* result) const +{ + if (result->get_size() != this->get_size()) { + result->set_size(this->get_size()); + result->stride_ = stride_; + result->values_.resize_and_reset(result->get_size()[0] * + result->stride_); + } + auto exec = this->get_executor(); + exec->run(dense::make_copy( + this, make_temporary_output_clone(exec, result).get())); +} + + +template +void Dense::move_to( + Dense>>* result) +{ + this->convert_to(result); +} + + +template +void Dense::convert_to( + Dense>* result) const +{ + if (result->get_size() != this->get_size()) { + result->set_size(this->get_size()); + result->stride_ = stride_; + result->values_.resize_and_reset(result->get_size()[0] * + result->stride_); + } + auto exec = this->get_executor(); + exec->run(dense::make_copy( + this, make_temporary_output_clone(exec, result).get())); +} + + +template +void Dense::move_to(Dense>* result) +{ + this->convert_to(result); +} +#endif + + template template void Dense::convert_impl(Coo* result) const @@ -1343,7 +1392,9 @@ void gather_mixed_real_complex(Function fn, LinOp* out) #ifdef GINKGO_MIXED_PRECISION using fst_type = matrix::Dense; using snd_type = matrix::Dense>; - run(out, fn); + using trd_type = matrix::Dense>>; + using fth_type = matrix::Dense>; + run(out, fn); #else precision_dispatch(fn, out); #endif diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index 9a487fadeda..4cf8a1cac2a 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 17edfb9cd8b..f4d85469780 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -193,6 +193,41 @@ void Diagonal::move_to(Diagonal>* result) } +#if GINKGO_ENABLE_HALF +template +void Diagonal::convert_to( + Diagonal>>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void Diagonal::move_to( + Diagonal>>* result) +{ + this->convert_to(result); +} + +template +void Diagonal::convert_to( + Diagonal>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void Diagonal::move_to( + Diagonal>* result) +{ + this->convert_to(result); +} +#endif + + template void Diagonal::convert_to(Csr* result) const { diff --git a/core/matrix/diagonal_kernels.hpp b/core/matrix/diagonal_kernels.hpp index 9d3e7901dc5..88a4b790458 100644 --- a/core/matrix/diagonal_kernels.hpp +++ b/core/matrix/diagonal_kernels.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 4c859656866..8bdbeed628f 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -202,6 +202,48 @@ void Ell::move_to( } +#if GINKGO_ENABLE_HALF +template +void Ell::convert_to( + Ell>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->num_stored_elements_per_row_ = this->num_stored_elements_per_row_; + result->stride_ = this->stride_; + result->set_size(this->get_size()); +} + + +template +void Ell::move_to( + Ell>, IndexType>* result) +{ + this->convert_to(result); +} + + +template +void Ell::convert_to( + Ell, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->num_stored_elements_per_row_ = this->num_stored_elements_per_row_; + result->stride_ = this->stride_; + result->set_size(this->get_size()); +} + + +template +void Ell::move_to( + Ell, IndexType>* result) +{ + this->convert_to(result); +} +#endif + + template void Ell::convert_to(Dense* result) const { diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index f5494871791..8842a4b0c3a 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -198,6 +198,51 @@ void Fbcsr::move_to( } +#if GINKGO_ENABLE_HALF +template +void Fbcsr::convert_to( + Fbcsr>, IndexType>* const result) + const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + // block sizes are immutable except for assignment/conversion + result->bs_ = this->bs_; +} + + +template +void Fbcsr::move_to( + Fbcsr>, IndexType>* const result) +{ + this->convert_to(result); +} + + +template +void Fbcsr::convert_to( + Fbcsr, IndexType>* const result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + 
// block sizes are immutable except for assignment/conversion + result->bs_ = this->bs_; +} + + +template +void Fbcsr::move_to( + Fbcsr, IndexType>* const result) +{ + this->convert_to(result); +} +#endif + + template void Fbcsr::convert_to( Dense* const result) const diff --git a/core/matrix/fbcsr_kernels.hpp b/core/matrix/fbcsr_kernels.hpp index c180527a216..6a8bfe259e9 100644 --- a/core/matrix/fbcsr_kernels.hpp +++ b/core/matrix/fbcsr_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/matrix/fft_kernels.hpp b/core/matrix/fft_kernels.hpp index 09e16dc8a1a..7de42cedc13 100644 --- a/core/matrix/fft_kernels.hpp +++ b/core/matrix/fft_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index b49a6241c37..fd71683404d 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -181,6 +181,50 @@ void Hybrid::move_to( } +#if GINKGO_ENABLE_HALF +template +void Hybrid::convert_to( + Hybrid>, IndexType>* result) const +{ + this->ell_->convert_to(result->ell_.get()); + this->coo_->convert_to(result->coo_.get()); + // TODO set strategy correctly + // There is no way to correctly clone the strategy like in + // Csr::convert_to + result->set_size(this->get_size()); +} + + +template +void Hybrid::move_to( + Hybrid>, IndexType>* result) +{ + this->convert_to(result); +} + + +template +void Hybrid::convert_to( + Hybrid, IndexType>* result) const +{ + this->ell_->convert_to(result->ell_.get()); + this->coo_->convert_to(result->coo_.get()); + // TODO set strategy correctly + // There is no way to correctly clone the strategy like in + // Csr::convert_to + result->set_size(this->get_size()); +} + + +template +void Hybrid::move_to( + Hybrid, IndexType>* result) +{ + this->convert_to(result); +} +#endif + + template void 
Hybrid::convert_to(Dense* result) const { diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index 442b192f07d..2c084a253a6 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -46,7 +47,15 @@ namespace matrix { template void RowGatherer::apply_impl(const LinOp* in, LinOp* out) const { - run*, const Dense*, + run< +#if GINKGO_ENABLE_HALF + const Dense*, const Dense*, +#endif + const Dense*, const Dense*, +#if GINKGO_ENABLE_HALF + const Dense>*, + const Dense>*, +#endif const Dense>*, const Dense>*>( in, [&](auto gather) { gather->row_gather(&row_idxs_, out); }); } @@ -55,7 +64,15 @@ template void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) const { - run*, const Dense*, + run< +#if GINKGO_ENABLE_HALF + const Dense*, const Dense*, +#endif + const Dense*, const Dense*, +#if GINKGO_ENABLE_HALF + const Dense>*, + const Dense>*, +#endif const Dense>*, const Dense>*>( in, [&](auto gather) { gather->row_gather(alpha, &row_idxs_, beta, out); }); diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index 7a343d8e97f..880fc36d827 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -178,6 +178,52 @@ void Sellp::move_to( } +#if GINKGO_ENABLE_HALF +template +void Sellp::convert_to( + Sellp>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->slice_lengths_ = this->slice_lengths_; + result->slice_sets_ = this->slice_sets_; + result->slice_size_ = this->slice_size_; + result->stride_factor_ = this->stride_factor_; + result->set_size(this->get_size()); +} + + +template +void Sellp::move_to( + Sellp>, IndexType>* result) +{ + this->convert_to(result); +} + + +template +void Sellp::convert_to( + Sellp, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = 
this->col_idxs_; + result->slice_lengths_ = this->slice_lengths_; + result->slice_sets_ = this->slice_sets_; + result->slice_size_ = this->slice_size_; + result->stride_factor_ = this->stride_factor_; + result->set_size(this->get_size()); +} + + +template +void Sellp::move_to( + Sellp, IndexType>* result) +{ + this->convert_to(result); +} +#endif + + template void Sellp::convert_to(Dense* result) const { diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp index 8f80e738b91..d5cebdb7007 100644 --- a/core/matrix/sparsity_csr_kernels.hpp +++ b/core/matrix/sparsity_csr_kernels.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 5e4ff888034..a37a3f9050b 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -176,7 +177,7 @@ void Pgm::generate() auto abs_mtx = pgm_op->compute_absolute(); // abs_mtx is already real valuetype, so transpose is enough auto weight_mtx = gko::as(abs_mtx->transpose()); - auto half_scalar = initialize>({0.5}, exec); + auto half_scalar = initialize>({half(0.5)}, exec); auto identity = matrix::Identity::create(exec, num_rows); // W = (abs_mtx + transpose(abs_mtx))/2 abs_mtx->apply(half_scalar, identity, half_scalar, weight_mtx); @@ -237,6 +238,5 @@ void Pgm::generate() #define GKO_DECLARE_PGM(_vtype, _itype) class Pgm<_vtype, _itype> GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM); - } // namespace multigrid } // namespace gko diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index f6f3e8018c5..717385fdb9a 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -317,10 +317,13 @@ void Jacobi::generate(const LinOp* system_matrix, if 
(parameters_.max_block_size == 1) { auto diag = share(as(system_matrix) ->extract_diagonal_linop()); - auto diag_vt = - ::gko::detail::temporary_conversion>:: - template create>>( - diag.get()); + auto diag_vt = ::gko::detail:: + temporary_conversion>::template create< + matrix::Diagonal>, + matrix::Diagonal< + previous_precision>>, + matrix::Diagonal>>( + diag.get()); if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); } diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index 957d5b4a324..bcf463775a9 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_PRECONDITIONER_JACOBI_UTILS_HPP_ +#include #include #include @@ -144,21 +145,23 @@ GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions( auto supported = static_cast(prd::p0n0); // the following code uses short-circuiting to avoid calling possibly // expensive verificatiors multiple times - if (accurate(float_traits>>::eps)) { + if (accurate(type(float_traits>>::eps))) { supported |= prd::p2n0; } - if (accurate(float_traits>>::eps) && + if (accurate( + type(float_traits>>::eps)) && (is_verified1 = verificator1())) { supported |= prd::p1n1; } - if (accurate(float_traits>>::eps) && + if (accurate(type( + float_traits>>::eps)) && is_verified1 != 0 && verificator2()) { supported |= prd::p0n2; } - if (accurate(float_traits>::eps)) { + if (accurate(type(float_traits>::eps))) { supported |= prd::p1n0; } - if (accurate(float_traits>::eps) && + if (accurate(type(float_traits>::eps)) && (is_verified1 == 1 || (is_verified1 == 2 && (is_verified1 = verificator1())))) { supported |= prd::p0n1; diff --git a/core/reorder/rcm_kernels.hpp b/core/reorder/rcm_kernels.hpp index 4fde334a26b..5f4b329c554 100644 --- a/core/reorder/rcm_kernels.hpp +++ b/core/reorder/rcm_kernels.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/bicg_kernels.hpp b/core/solver/bicg_kernels.hpp index 6f22feb9446..6f1244de6fa 100644 --- a/core/solver/bicg_kernels.hpp +++ b/core/solver/bicg_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/bicgstab_kernels.hpp b/core/solver/bicgstab_kernels.hpp index bdd2a18db48..81cb41fa605 100644 --- a/core/solver/bicgstab_kernels.hpp +++ b/core/solver/bicgstab_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp index be9dbbf0fdb..353e3703d2d 100644 --- a/core/solver/cb_gmres.cpp +++ b/core/solver/cb_gmres.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include @@ -514,8 +515,8 @@ void CbGmres::apply_impl(const LinOp* alpha, const LinOp* b, #define GKO_DECLARE_CB_GMRES(_type1) class CbGmres<_type1> #define GKO_DECLARE_CB_GMRES_TRAITS(_type1) \ struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_TRAITS); } // namespace solver diff --git a/core/solver/cb_gmres_accessor.hpp b/core/solver/cb_gmres_accessor.hpp index 125dc5e901c..0743b706453 100644 --- a/core/solver/cb_gmres_accessor.hpp +++ b/core/solver/cb_gmres_accessor.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include diff --git a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp index a0040bc24cf..f584a364e5e 100644 --- a/core/solver/cb_gmres_kernels.hpp +++ b/core/solver/cb_gmres_kernels.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/core/solver/cg_kernels.hpp b/core/solver/cg_kernels.hpp index d1eb99ef6ec..81b83007667 100644 --- a/core/solver/cg_kernels.hpp +++ b/core/solver/cg_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/cgs_kernels.hpp b/core/solver/cgs_kernels.hpp index a618e8d7a9a..9d44540f347 100644 --- a/core/solver/cgs_kernels.hpp +++ b/core/solver/cgs_kernels.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/common_gmres_kernels.hpp b/core/solver/common_gmres_kernels.hpp index bde667b79d8..a1288301145 100644 --- a/core/solver/common_gmres_kernels.hpp +++ b/core/solver/common_gmres_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp index bd236f8a158..8d0ef899fee 100644 --- a/core/solver/gmres_kernels.hpp +++ b/core/solver/gmres_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 52b1eddc11f..4dd5c0fc260 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include @@ -70,6 +71,10 @@ std::unique_ptr Idr::transpose() const .with_generated_preconditioner( share(as(this->get_preconditioner())->transpose())) .with_criteria(this->get_stop_criterion_factory()) + .with_subspace_dim(this->get_subspace_dim()) + .with_kappa(this->get_kappa()) + .with_deterministic(this->get_deterministic()) + .with_complex_subspace(this->get_complex_subspace()) .on(this->get_executor()) ->generate( share(as(this->get_system_matrix())->transpose())); @@ -83,6 +88,10 @@ std::unique_ptr Idr::conj_transpose() const .with_generated_preconditioner(share( as(this->get_preconditioner())->conj_transpose())) .with_criteria(this->get_stop_criterion_factory()) + .with_subspace_dim(this->get_subspace_dim()) + .with_kappa(this->get_kappa()) + .with_deterministic(this->get_deterministic()) + .with_complex_subspace(this->get_complex_subspace()) .on(this->get_executor()) ->generate(share( as(this->get_system_matrix())->conj_transpose())); diff --git a/core/solver/idr_kernels.hpp b/core/solver/idr_kernels.hpp index 1db367622b6..7fbf73f325e 100644 --- a/core/solver/idr_kernels.hpp +++ b/core/solver/idr_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/solver/ir_kernels.hpp b/core/solver/ir_kernels.hpp index ef4633d61f0..b29d624dac6 100644 --- a/core/solver/ir_kernels.hpp +++ b/core/solver/ir_kernels.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 303106fa4f6..de4f711d423 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include @@ -314,7 +315,14 @@ void MultigridState::generate(const LinOp* system_matrix_in, auto next_nrows = mg_level_list.at(i)->get_coarse_op()->get_size()[0]; auto mg_level = mg_level_list.at(i); - run, std::complex, +#endif std::complex, std::complex>( mg_level, [&, this](auto mg_level, auto i, auto cycle, auto current_nrows, @@ -371,7 +379,14 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, return; } auto mg_level = multigrid->get_mg_level_list().at(level); - run, std::complex, +#endif std::complex, std::complex>( mg_level, [&, this](auto mg_level) { using value_type = @@ -516,7 +531,14 @@ void Multigrid::generate() break; } - run, std::complex, +#endif std::complex, std::complex>( mg_level, [this](auto mg_level, auto index, auto matrix) { @@ -554,7 +576,14 @@ void Multigrid::generate() auto last_mg_level = mg_level_list_.back(); // generate coarsest solver - run, std::complex, +#endif std::complex, std::complex>( last_mg_level, [this](auto mg_level, auto level, auto matrix) { @@ -640,7 +669,14 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, b, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, std::complex, +#endif std::complex, std::complex>(first_mg_level, lambda, b, x); } @@ -679,7 +715,14 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, alpha, b, beta, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, std::complex, +#endif std::complex, std::complex>(first_mg_level, lambda, alpha, b, beta, x); } @@ -744,7 +787,14 @@ void Multigrid::apply_dense_impl(const VectorType* b, VectorType* x, auto first_mg_level = this->get_mg_level_list().front(); - run, std::complex, +#endif std::complex, std::complex>(first_mg_level, lambda, b, x); } diff --git a/core/solver/multigrid_kernels.hpp b/core/solver/multigrid_kernels.hpp index 4869fd0ddad..fa246283b36 100644 --- a/core/solver/multigrid_kernels.hpp 
+++ b/core/solver/multigrid_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/stop/criterion_kernels.hpp b/core/stop/criterion_kernels.hpp index 8d4fb395841..7a9d537fe8a 100644 --- a/core/stop/criterion_kernels.hpp +++ b/core/stop/criterion_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/core/stop/residual_norm_kernels.hpp b/core/stop/residual_norm_kernels.hpp index f9c2ce89f93..c17f9dabfd8 100644 --- a/core/stop/residual_norm_kernels.hpp +++ b/core/stop/residual_norm_kernels.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/core/test/accessor/reduced_row_major_ginkgo.cpp b/core/test/accessor/reduced_row_major_ginkgo.cpp index b12fba6ad0f..d6649e5f4c7 100644 --- a/core/test/accessor/reduced_row_major_ginkgo.cpp +++ b/core/test/accessor/reduced_row_major_ginkgo.cpp @@ -40,6 +40,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include // necessary for gko::half + + #include "accessor/index_span.hpp" #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt index 36bad656b07..200c181e513 100644 --- a/core/test/base/CMakeLists.txt +++ b/core/test/base/CMakeLists.txt @@ -10,6 +10,7 @@ ginkgo_create_test(dim) ginkgo_create_test(exception) ginkgo_create_test(exception_helpers) ginkgo_create_test(extended_float) +ginkgo_create_test(extended_bfloat16) ginkgo_create_test(executor) ginkgo_create_test(iterator_factory) ginkgo_create_test(lin_op) diff --git a/core/test/base/extended_bfloat16.cpp b/core/test/base/extended_bfloat16.cpp new file mode 100644 index 00000000000..4681f292325 --- /dev/null +++ b/core/test/base/extended_bfloat16.cpp @@ -0,0 +1,331 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include +#include + + +#include + + +#include + + +#include "core/base/extended_float.hpp" + +namespace { + + +template +struct floating_impl; + +template <> +struct floating_impl<16> { + using type = gko::bfloat16; +}; + +template <> +struct floating_impl<32> { + using type = float; +}; + +template <> +struct floating_impl<64> { + using type = double; +}; + +template +using floating = typename floating_impl::type; + + +class ExtendedFloatTestBase : public ::testing::Test { +protected: + using bfloat16 = gko::bfloat16; + template + using truncated = gko::truncated; + + static constexpr auto byte_size = gko::byte_size; + + template + static floating create_from_bits(const char (&s)[N]) + { + auto bits = std::bitset(s).to_ullong(); + return reinterpret_cast&>(bits); + } + + template + static std::bitset get_bits(T val) + { + auto bits = + reinterpret_cast::bits_type&>( + val); + return std::bitset(bits); + } + + template + static std::bitset get_bits(const char (&s)[N]) + { + return std::bitset(s); + } +}; + + +class FloatToBFloat16 : public ExtendedFloatTestBase {}; + + +// clang-format does terrible formatting of string literal concatenation +// clang-format off + + +TEST_F(FloatToBFloat16, ConvertsOne) +{ + bfloat16 x = create_from_bits("0" "01111111" "00000000000000000000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "01111111" "0000000")); +} + + +TEST_F(FloatToBFloat16, 
ConvertsZero) +{ + bfloat16 x = create_from_bits("0" "00000000" "00000000000000000000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "00000000" "0000000")); +} + + +TEST_F(FloatToBFloat16, ConvertsInf) +{ + bfloat16 x = create_from_bits("0" "11111111" "00000000000000000000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "0000000")); +} + + +TEST_F(FloatToBFloat16, ConvertsNegInf) +{ + bfloat16 x = create_from_bits("1" "11111111" "00000000000000000000000"); + + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "0000000")); +} + + +TEST_F(FloatToBFloat16, ConvertsNan) +{ + bfloat16 x = create_from_bits("0" "11111111" "00000000000000000000001"); + + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "1000000")); + #else + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "1111111")); + #endif +} + + +TEST_F(FloatToBFloat16, ConvertsNegNan) +{ + bfloat16 x = create_from_bits("1" "11111111" "00010000000000000000000"); + + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "1000000")); + #else + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "1111111")); + #endif +} + + +TEST_F(FloatToBFloat16, FlushesToZero) +{ + bfloat16 x = create_from_bits("0" "00000000" "00000000000100000001000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "00000000" "0000000")); +} + + +TEST_F(FloatToBFloat16, FlushesToNegZero) +{ + bfloat16 x = create_from_bits("1" "00000000" "00000000000100000001000"); + + ASSERT_EQ(get_bits(x), get_bits("1" "00000000" "0000000")); +} + + +TEST_F(FloatToBFloat16, FlushesToInf) +{ + bfloat16 x = create_from_bits("0" "11111110" "11111111111111111111111"); + + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" 
"0000000")); +} + + +TEST_F(FloatToBFloat16, FlushesToNegInf) +{ + bfloat16 x = create_from_bits("1" "11111110" "11111111111111111111111"); + + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "0000000")); +} + + +TEST_F(FloatToBFloat16, TruncatesSmallNumber) +{ + bfloat16 x = create_from_bits("0" "01110001" "10010000000000010000100"); + + ASSERT_EQ(get_bits(x), get_bits("0" "01110001" "1001000")); +} + + +TEST_F(FloatToBFloat16, TruncatesLargeNumberRoundToEven) +{ + bfloat16 neg_x = create_from_bits("1" "10001110" "10010111111000010000100"); + bfloat16 neg_x2 = create_from_bits("1" "10001110" "10010101111000010000100"); + bfloat16 x = create_from_bits("0" "10001110" "10010111111000010000100"); + bfloat16 x2 = create_from_bits("0" "10001110" "10010101111000010000100"); + bfloat16 x3 = create_from_bits("0" "10001110" "10010101000000000000000"); + bfloat16 x4 = create_from_bits("0" "10001110" "10010111000000000000000"); + + EXPECT_EQ(get_bits(x), get_bits("0" "10001110" "1001100")); + EXPECT_EQ(get_bits(x2), get_bits("0" "10001110" "1001011")); + EXPECT_EQ(get_bits(x3), get_bits("0" "10001110" "1001010")); + EXPECT_EQ(get_bits(x4), get_bits("0" "10001110" "1001100")); + EXPECT_EQ(get_bits(neg_x), get_bits("1" "10001110" "1001100")); + EXPECT_EQ(get_bits(neg_x2), get_bits("1" "10001110" "1001011")); +} + + +TEST_F(FloatToBFloat16, Convert) +{ + float rho = 86.25; + float beta = 1110; + auto float_res = rho/beta; + gko::bfloat16 rho_h = rho; + gko::bfloat16 beta_h = beta; + auto bfloat16_res = rho_h/beta_h; + std::cout << float_res << std::endl; + std::cout << float(bfloat16_res) << std::endl; + + std::complex cpx{100.0, 0.0}; + std::cout << float(gko::squared_norm(cpx)) << std::endl; +} + +// clang-format on + + +class bfloat16ToFloat : public ExtendedFloatTestBase {}; + + +// clang-format off + + +TEST_F(bfloat16ToFloat, ConvertsOne) +{ + float x = create_from_bits("0" "01111111" "0000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "01111111" 
"00000000000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ConvertsZero) +{ + float x = create_from_bits("0" "00000000" "0000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "00000000" "00000000000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ConvertsInf) +{ + float x = create_from_bits("0" "11111111" "0000000"); + + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00000000000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ConvertsNegInf) +{ + float x = create_from_bits("1" "11111111" "0000000"); + + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ConvertsNan) +{ + float x = create_from_bits("0" "11111111" "0001001"); + + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000")); + #else + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "11111111111111111111111")); + #endif +} + + +TEST_F(bfloat16ToFloat, ConvertsNegNan) +{ + float x = create_from_bits("1" "11111111" "0000001"); + + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000010000000000000000")); + #else + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "11111111111111111111111")); + #endif +} + + +TEST_F(bfloat16ToFloat, ExtendsSmallNumber) +{ + float x = create_from_bits("0" "01110001" "1000010"); + + ASSERT_EQ(get_bits(x), get_bits("0" "01110001" "10000100000000000000000")); +} + + +TEST_F(bfloat16ToFloat, ExtendsLargeNumber) +{ + float x = create_from_bits("1" "10001110" "1001001"); + + ASSERT_EQ(get_bits(x), get_bits("1" "10001110" "10010010000000000000000")); +} + + +// clang-format on + + +} // namespace diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index 
bab3ac9926f..c8d7b450701 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -34,12 +34,15 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include +#include + namespace { @@ -140,7 +143,13 @@ TEST_F(FloatToHalf, ConvertsNan) { half x = create_from_bits("0" "11111111" "00000000000000000000001"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1111111111")); + #endif } @@ -148,7 +157,13 @@ TEST_F(FloatToHalf, ConvertsNegNan) { half x = create_from_bits("1" "11111111" "00010000000000000000000"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1111111111")); + #endif } @@ -192,15 +207,39 @@ TEST_F(FloatToHalf, TruncatesSmallNumber) } -TEST_F(FloatToHalf, TruncatesLargeNumber) +TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven) { - half x = create_from_bits("1" "10001110" "10010011111000010000100"); + half neg_x = create_from_bits("1" "10001110" "10010011111000010000100"); + half neg_x2 = create_from_bits("1" "10001110" "10010011101000010000100"); + half x = create_from_bits("0" "10001110" "10010011111000010000100"); + half x2 = create_from_bits("0" "10001110" "10010011101000010000100"); + half x3 = create_from_bits("0" "10001110" "10010011101000000000000"); + half x4 = create_from_bits("0" "10001110" "10010011111000000000000"); + + EXPECT_EQ(get_bits(x), get_bits("0" "11110" "1001010000")); + EXPECT_EQ(get_bits(x2), get_bits("0" "11110" "1001001111")); + EXPECT_EQ(get_bits(x3), 
get_bits("0" "11110" "1001001110")); + EXPECT_EQ(get_bits(x4), get_bits("0" "11110" "1001010000")); + EXPECT_EQ(get_bits(neg_x), get_bits("1" "11110" "1001010000")); + EXPECT_EQ(get_bits(neg_x2), get_bits("1" "11110" "1001001111")); +} - ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001001111")); +TEST_F(FloatToHalf, Convert) +{ + float rho = 86.25; + float beta = 1110; + auto float_res = rho/beta; + gko::half rho_h = rho; + gko::half beta_h = beta; + auto half_res = rho_h/beta_h; + std::cout << float_res << std::endl; + std::cout << float(half_res) << std::endl; + + std::complex cpx{100.0, 0.0}; + std::cout << float(gko::squared_norm(cpx)) << std::endl; } - // clang-format on @@ -246,7 +285,13 @@ TEST_F(HalfToFloat, ConvertsNan) { float x = create_from_bits("0" "11111" "0001001000"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "11111111111111111111111")); + #endif } @@ -254,7 +299,13 @@ TEST_F(HalfToFloat, ConvertsNegNan) { float x = create_from_bits("1" "11111" "0000000001"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000010000000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "11111111111111111111111")); + #endif } diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp index 3558a7d5564..82c5d831b70 100644 --- a/core/test/log/stream.cpp +++ b/core/test/log/stream.cpp @@ -413,17 +413,17 @@ TYPED_TEST(Stream, CatchesLinOpApplyStartedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_apply_started_mask, out, true); - auto A = gko::initialize({1.1}, exec); - 
auto b = gko::initialize({-2.2}, exec); - auto x = gko::initialize({3.3}, exec); + auto A = gko::initialize({1.5}, exec); + auto b = gko::initialize({-2.25}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on(A.get(), b.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -462,17 +462,17 @@ TYPED_TEST(Stream, CatchesLinOpApplyCompletedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_apply_completed_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto b = gko::initialize({-2.2}, exec); - auto x = gko::initialize({3.3}, exec); + auto A = gko::initialize({1.5}, exec); + auto b = gko::initialize({-2.25}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), b.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -519,21 +519,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_advanced_apply_started_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto alpha = gko::initialize({-4.4}, exec); - auto b = gko::initialize({-2.2}, exec); + auto A = gko::initialize({1.5}, exec); + auto alpha = gko::initialize({-4.75}, exec); + auto b = gko::initialize({-2.25}, exec); auto beta = gko::initialize({-5.5}, exec); - auto x = gko::initialize({3.3}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, 
"1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-4.75"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -580,21 +580,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_advanced_apply_completed_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto alpha = gko::initialize({-4.4}, exec); - auto b = gko::initialize({-2.2}, exec); + auto A = gko::initialize({1.5}, exec); + auto alpha = gko::initialize({-4.75}, exec); + auto b = gko::initialize({-2.25}, exec); auto beta = gko::initialize({-5.5}, exec); - auto x = gko::initialize({3.3}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-4.75"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -818,11 +818,11 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose) .with_criteria( gko::stop::Iteration::build().with_max_iters(3u).on(exec)) .on(exec); - auto solver = factory->generate(gko::initialize({1.1}, exec)); + auto solver = factory->generate(gko::initialize({1.25}, exec)); auto right_hand_side = gko::initialize({-5.5}, exec); - auto residual = gko::initialize({-4.4}, exec); - auto solution = gko::initialize({-2.2}, exec); - auto residual_norm = gko::initialize({-3.3}, exec); + auto residual = gko::initialize({-4.5}, exec); + auto solution = gko::initialize({-2.25}, exec); + auto residual_norm = 
gko::initialize({-3.125}, exec); gko::array stop_status(exec, 1); logger->template on( @@ -831,9 +831,9 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose) auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "-3.3"); + GKO_ASSERT_STR_CONTAINS(os, "-4.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "-3.125"); GKO_ASSERT_STR_CONTAINS(os, "Finalized:") } diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp index f7ba80ebba1..fd4053617fd 100644 --- a/core/test/solver/gcr.cpp +++ b/core/test/solver/gcr.cpp @@ -60,8 +60,7 @@ class Gcr : public ::testing::Test { using Solver = gko::solver::Gcr; using Big_solver = gko::solver::Gcr; - static constexpr gko::remove_complex reduction_factor = - gko::remove_complex(1e-6); + static const gko::remove_complex reduction_factor; Gcr() : exec(gko::ReferenceExecutor::create()), @@ -108,7 +107,8 @@ class Gcr : public ::testing::Test { }; template -constexpr gko::remove_complex Gcr::reduction_factor; +const gko::remove_complex Gcr::reduction_factor = + gko::remove_complex(1e-6); TYPED_TEST_SUITE(Gcr, gko::test::ValueTypes, TypenameNameGenerator); diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp index 11cafe2c86f..3e54f7a6d04 100644 --- a/core/test/solver/gmres.cpp +++ b/core/test/solver/gmres.cpp @@ -60,8 +60,8 @@ class Gmres : public ::testing::Test { using Solver = gko::solver::Gmres; using Big_solver = gko::solver::Gmres; - static constexpr gko::remove_complex reduction_factor = - gko::remove_complex(1e-6); + // half does not have constexpr constructor + static const gko::remove_complex reduction_factor; Gmres() : exec(gko::ReferenceExecutor::create()), @@ -97,7 +97,8 @@ class Gmres : public ::testing::Test { }; template -constexpr gko::remove_complex Gmres::reduction_factor; +const gko::remove_complex Gmres::reduction_factor = + gko::remove_complex(1e-6); 
TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, TypenameNameGenerator); diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp index 856f9651ebe..b8672ef7079 100644 --- a/core/test/solver/multigrid.cpp +++ b/core/test/solver/multigrid.cpp @@ -108,9 +108,7 @@ class DummyLinOpWithFactory std::make_shared(this->get_executor(), gko::dim<2>{n_, n_ - 1}), gko::share(gko::test::generate_random_dense_matrix( - n_ - 1, n_ - 1, - std::uniform_real_distribution>( - 0, 1), + n_ - 1, n_ - 1, std::uniform_real_distribution<>(0, 1), std::default_random_engine{}, factory->get_executor())), std::make_shared(this->get_executor(), gko::dim<2>{n_ - 1, n_})); diff --git a/core/test/utils.hpp b/core/test/utils.hpp index a16db1eb93a..1982aae20a5 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -61,8 +62,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { namespace test { +#if GINKGO_ENABLE_HALF +#define OPTIONAL(...) __VA_ARGS__, +#else +#define OPTIONAL(...) 
+#endif using ValueTypes = +#if GINKGO_DPCPP_SINGLE_MODE + ::testing::Types) std::complex>; +#else + ::testing::Types) std::complex, + std::complex>; +#endif + +using ValueTypesNoHalf = #if GINKGO_DPCPP_SINGLE_MODE ::testing::Types>; #else @@ -70,6 +86,14 @@ using ValueTypes = #endif using ComplexValueTypes = +#if GINKGO_DPCPP_SINGLE_MODE + ::testing::Types) std::complex>; +#else + ::testing::Types) std::complex, + std::complex>; +#endif + +using ComplexValueTypesNoHalf = #if GINKGO_DPCPP_SINGLE_MODE ::testing::Types>; #else @@ -78,9 +102,9 @@ using ComplexValueTypes = using RealValueTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif @@ -102,58 +126,79 @@ using PODTypes = using ValueAndIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, gko::int32, gko::int64, - gko::size_type>; -#else - ::testing::Types, std::complex, + ::testing::Types) std::complex, gko::int32, gko::int64, gko::size_type>; +#else + ::testing::Types) std::complex, + std::complex, gko::int32, gko::int64, + gko::size_type>; #endif using RealValueAndIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; + ::testing::Types; #else - ::testing::Types; + ::testing::Types; #endif using ValueIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, - std::tuple, gko::int32>, + ::testing::Types) + std::tuple, + OPTIONAL(std::tuple, gko::int32>) + std::tuple, gko::int32>, std::tuple, std::tuple, gko::int64>>; #else - ::testing::Types< - std::tuple, std::tuple, - std::tuple, gko::int32>, - std::tuple, gko::int32>, - std::tuple, std::tuple, - std::tuple, gko::int64>, - std::tuple, gko::int64>>; + ::testing::Types) + std::tuple, + std::tuple, + OPTIONAL(std::tuple, gko::int32>) + std::tuple, gko::int32>, + std::tuple, gko::int32>, + OPTIONAL(std::tuple) + std::tuple, + std::tuple, + OPTIONAL(std::tuple, gko::int64>) + std::tuple, gko::int64>, + std::tuple, gko::int64>>; #endif using RealValueIndexTypes = #if 
GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, - std::tuple>; + ::testing::Types) + std::tuple, + OPTIONAL(std::tuple) + std::tuple>; #else - ::testing::Types< - std::tuple, std::tuple, - std::tuple, std::tuple>; + ::testing::Types) + std::tuple, + std::tuple, + OPTIONAL(std::tuple) + std::tuple, + std::tuple>; #endif using ComplexValueIndexTypes = #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types, gko::int32>, - std::tuple, gko::int64>>; + ::testing::Types) gko::int32>, + std::tuple, gko::int32>, + OPTIONAL(std::tuple, gko::int64>) + std::tuple, gko::int64>> ; #else - ::testing::Types, gko::int32>, + ::testing::Types, gko::int32>) + std::tuple, gko::int32>, std::tuple, gko::int32>, - std::tuple, gko::int64>, + OPTIONAL(std::tuple, gko::int64>) + std::tuple, gko::int64>, std::tuple, gko::int64>>; #endif @@ -214,15 +259,14 @@ template struct reduction_factor { using nc_output = remove_complex; using nc_precision = remove_complex; - static constexpr nc_output value{ - std::numeric_limits::epsilon() * nc_output{10} * - (gko::is_complex() ? nc_output{1.4142} : one())}; + static nc_output value; }; template -constexpr remove_complex - reduction_factor::value; +remove_complex reduction_factor::value = + std::numeric_limits::epsilon() * nc_output{10} * + (gko::is_complex() ? 
nc_output{1.4142} : one()); } // namespace test @@ -304,4 +348,60 @@ struct TupleTypenameNameGenerator { }; +namespace detail { + + +// singly linked list of all our supported precisions +template +struct next_precision_impl {}; + +template <> +struct next_precision_impl { + using type = float; +}; + +template <> +struct next_precision_impl { + using type = float; +}; + +template <> +struct next_precision_impl { + using type = double; +}; + +template <> +struct next_precision_impl { + using type = float; +}; + + +template +struct next_precision_impl> { + using type = std::complex::type>; +}; + + +} // namespace detail + +template +using next_precision = typename detail::next_precision_impl::type; + + +#define SKIP_IF_HALF(type) \ + if (std::is_same, gko::half>::value) { \ + GTEST_SKIP() << "Skip due to half mode"; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + +#define SKIP_IF_BFLOAT16(type) \ + if (std::is_same, gko::bfloat16>::value) { \ + GTEST_SKIP() << "Skip due to bfloat16 mode"; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + #endif // GKO_CORE_TEST_UTILS_HPP_ diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp index 72214c49d7c..e38fa72aaed 100644 --- a/core/test/utils/array_generator_test.cpp +++ b/core/test/utils/array_generator_test.cpp @@ -53,7 +53,7 @@ class ArrayGenerator : public ::testing::Test { ArrayGenerator() : exec(gko::ReferenceExecutor::create()) { array = gko::test::generate_random_array( - 500, std::normal_distribution>(20.0, 5.0), + 500, std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec); } @@ -65,8 +65,10 @@ class ArrayGenerator : public ::testing::Test { InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + // use double to avoid rounding error + double res = 0; 
+ // can not use ValueType when it is bfloat16 + int num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); res += pow(closure_op(tmp) - c, n); diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 153907cf2cf..7bf5db1b5e9 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -669,15 +670,19 @@ ::testing::AssertionResult values_near, std::complex>( std::complex val2, double abs_error) { using T = std::complex; - const double diff = abs(T{val1} - T{val2}); + // T{val1} calls the constructor of complex() -> which gives the + // complex(double/float) ambiguous + T Tval1 = val1; + T Tval2 = val2; + const double diff = abs(Tval1 - Tval2); if (diff <= abs_error) return ::testing::AssertionSuccess(); return ::testing::AssertionFailure() << "The difference between " << first_expression << " and " << second_expression << " is " << diff << ", which exceeds " << tolerance_expression << ", where\n" - << first_expression << " evaluates to " << T{val1} << ",\n" - << second_expression << " evaluates to " << T{val2} << ", and\n" + << first_expression << " evaluates to " << Tval1 << ",\n" + << second_expression << " evaluates to " << Tval2 << ", and\n" << tolerance_expression << " evaluates to " << abs_error << "."; } diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp index 7c43b0905c1..71c92f6b990 100644 --- a/core/test/utils/fb_matrix_generator.hpp +++ b/core/test/utils/fb_matrix_generator.hpp @@ -161,16 +161,15 @@ std::unique_ptr> generate_fbcsr_from_csr( const IndexType* const row_ptrs = fmtx->get_const_row_ptrs(); const IndexType* const col_idxs = fmtx->get_const_col_idxs(); ValueType* const vals = fmtx->get_values(); - std::uniform_real_distribution> - off_diag_dist(-1.0, 1.0); + 
std::uniform_real_distribution<> off_diag_dist(-1.0, 1.0); for (IndexType ibrow = 0; ibrow < nbrows; ibrow++) { if (row_diag_dominant) { const IndexType nrownz = (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * block_size; - std::uniform_real_distribution> - diag_dist(1.01 * nrownz, 2 * nrownz); + std::uniform_real_distribution<> diag_dist(1.01 * nrownz, + 2 * nrownz); for (IndexType ibz = row_ptrs[ibrow]; ibz < row_ptrs[ibrow + 1]; ibz++) { @@ -235,13 +234,11 @@ std::unique_ptr> generate_random_fbcsr( matrix::Csr>( nbrows, nbcols, std::uniform_int_distribution(0, nbcols - 1), - std::normal_distribution(0.0, 1.0), - std::move(engine), ref) + std::normal_distribution<>(0.0, 1.0), std::move(engine), ref) : generate_random_matrix>( nbrows, nbcols, std::uniform_int_distribution(0, nbcols - 1), - std::normal_distribution(0.0, 1.0), - std::move(engine), ref); + std::normal_distribution<>(0.0, 1.0), std::move(engine), ref); if (unsort && rand_csr_ref->is_sorted_by_column_index()) { unsort_matrix(rand_csr_ref, engine); } diff --git a/core/test/utils/fb_matrix_generator_test.cpp b/core/test/utils/fb_matrix_generator_test.cpp index fe11e27ac9d..ebb45fccefb 100644 --- a/core/test/utils/fb_matrix_generator_test.cpp +++ b/core/test/utils/fb_matrix_generator_test.cpp @@ -59,8 +59,8 @@ class BlockMatrixGenerator : public ::testing::Test { : exec(gko::ReferenceExecutor::create()), mtx(gko::test::generate_random_matrix< gko::matrix::Csr>( - nbrows, nbcols, std::normal_distribution(10, 5), - std::normal_distribution(20.0, 5.0), + nbrows, nbcols, std::normal_distribution<>(10, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), rbmtx(gko::test::generate_fbcsr_from_csr( exec, mtx.get(), blk_sz, false, std::default_random_engine(42))), diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index 6928c5424a5..1194f86ef77 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -596,9 
+596,13 @@ gko::matrix_data generate_tridiag_inverse_matrix_data( auto off_diag = i < j ? upper : lower; auto min_idx = std::min(i, j); auto max_idx = std::max(i, j); + // TODO: NVHPC requires explicitly casting to single precision + // from half. auto val = sign * - static_cast( - std::pow(off_diag, max_idx - min_idx)) * + static_cast(std::pow( + typename gko::detail::arth_type::type{ + off_diag}, + max_idx - min_idx)) * alpha[min_idx] * beta[max_idx + 1] / alpha.back(); md.nonzeros.emplace_back(i, j, val); } diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index 411d5ec17d0..e703647ce9e 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -51,31 +51,33 @@ template class MatrixGenerator : public ::testing::Test { protected: using value_type = T; + using check_type = + typename gko::detail::arth_type>::type; using real_type = gko::remove_complex; using mtx_type = gko::matrix::Dense; MatrixGenerator() : exec(gko::ReferenceExecutor::create()), mtx(gko::test::generate_random_matrix( - 500, 100, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 500, 100, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), dense_mtx(gko::test::generate_random_dense_matrix( - 500, 100, std::normal_distribution(20.0, 5.0), + 500, 100, std::normal_distribution<>(20.0, 5.0), std::default_random_engine(41), exec)), l_mtx(gko::test::generate_random_lower_triangular_matrix( - 4, true, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 4, true, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), u_mtx(gko::test::generate_random_upper_triangular_matrix( - 4, true, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 4, true, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), 
std::default_random_engine(42), exec)), lower_bandwidth(2), upper_bandwidth(3), band_mtx(gko::test::generate_random_band_matrix( 100, lower_bandwidth, upper_bandwidth, - std::normal_distribution(20.0, 5.0), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), nnz_per_row_sample(500, 0), values_sample(0), @@ -127,15 +129,15 @@ class MatrixGenerator : public ::testing::Test { template - ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start, - InputIterator sample_end, Closure closure_op) + check_type get_nth_moment(int n, ValueType c, InputIterator sample_start, + InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + check_type res = 0; + check_type num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); - res += pow(closure_op(tmp) - c, n); + res += pow(check_type{closure_op(tmp)} - check_type{c}, n); num_elems += 1; } return res / num_elems; @@ -278,7 +280,7 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagMatrix) { using T = typename TestFixture::value_type; using Dense = typename TestFixture::mtx_type; - auto dist = std::normal_distribution>(0, 1); + auto dist = std::normal_distribution<>(0, 1); auto engine = std::default_random_engine(42); auto lower = gko::test::detail::get_rand_value(dist, engine); auto diag = gko::test::detail::get_rand_value(dist, engine); @@ -302,18 +304,23 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagInverseMatrix) { using T = typename TestFixture::value_type; using Dense = typename TestFixture::mtx_type; - auto dist = std::normal_distribution>(0, 1); + auto dist = std::normal_distribution<>(0, 1); auto engine = std::default_random_engine(42); auto lower = gko::test::detail::get_rand_value(dist, engine); auto upper = gko::test::detail::get_rand_value(dist, engine); // make diagonally dominant auto diag = std::abs(gko::test::detail::get_rand_value(dist, engine)) + std::abs(lower) + std::abs(upper); + 
gko::size_type size = 50; + if (std::is_same>::value) { + // half precision can only handle small matrix + size = 5; + } auto mtx = gko::test::generate_tridiag_matrix( - 50, {lower, diag, upper}, this->exec); + size, {lower, diag, upper}, this->exec); auto inv_mtx = gko::test::generate_tridiag_inverse_matrix( - 50, {lower, diag, upper}, this->exec); + size, {lower, diag, upper}, this->exec); auto result = Dense::create(this->exec, mtx->get_size()); inv_mtx->apply(mtx, result); diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 31a6072270e..c4a2e26af6b 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -62,8 +62,8 @@ class MatrixUtils : public ::testing::Test { MatrixUtils() : exec(gko::ReferenceExecutor::create()), data(gko::test::generate_random_matrix_data( - 500, 500, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 500, 500, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42))), rectangular_data(gko::dim<2>(500, 100)) {} @@ -258,9 +258,9 @@ TYPED_TEST(MatrixUtils, MakeHpdMatrixCorrectly) using T = typename TestFixture::value_type; auto cpy_data = this->data; - gko::utils::make_hpd(this->data, 1.001); + gko::utils::make_hpd(this->data, 1.01); gko::utils::make_hermitian(cpy_data); - gko::utils::make_diag_dominant(cpy_data, 1.001); + gko::utils::make_diag_dominant(cpy_data, 1.01); auto mtx = TestFixture::mtx_type::create(this->exec); mtx->read(this->data); @@ -273,7 +273,7 @@ TYPED_TEST(MatrixUtils, MakeHpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeHpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - gko::remove_complex ratio = 1.00001; + gko::remove_complex ratio = 1.02; auto cpy_data = this->data; gko::utils::make_hpd(this->data, ratio); @@ -293,9 +293,9 @@ TYPED_TEST(MatrixUtils, MakeSpdMatrixCorrectly) using T = typename TestFixture::value_type; auto cpy_data = 
this->data; - gko::utils::make_spd(this->data, 1.001); + gko::utils::make_spd(this->data, 1.01); gko::utils::make_symmetric(cpy_data); - gko::utils::make_diag_dominant(cpy_data, 1.001); + gko::utils::make_diag_dominant(cpy_data, 1.01); auto mtx = TestFixture::mtx_type::create(this->exec); mtx->read(this->data); @@ -308,7 +308,7 @@ TYPED_TEST(MatrixUtils, MakeSpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeSpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - gko::remove_complex ratio = 1.00001; + gko::remove_complex ratio = 1.02; auto cpy_data = this->data; gko::utils::make_spd(this->data, ratio); diff --git a/core/test/utils/value_generator_test.cpp b/core/test/utils/value_generator_test.cpp index c65cab1cce3..61cd4d7f809 100644 --- a/core/test/utils/value_generator_test.cpp +++ b/core/test/utils/value_generator_test.cpp @@ -59,8 +59,8 @@ class ValueGenerator : public ::testing::Test { InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + double res = 0; + int num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); res += pow(closure_op(tmp) - c, n); diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 4c972d2a584..e9b3e1bd954 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -105,6 +105,7 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") target_compile_options(ginkgo_cuda PRIVATE $<$:--extended-lambda>) + target_compile_options(ginkgo_cuda PRIVATE -Xcompiler="/bigobj") else() target_compile_options(ginkgo_cuda PRIVATE diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu index 7729d006b75..23fe80c2a1b 100644 --- a/cuda/base/batch_multi_vector_kernels.cu +++ b/cuda/base/batch_multi_vector_kernels.cu @@ -41,6 +41,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "cuda/base/types.hpp" + + #include "core/base/batch_struct.hpp" #include "cuda/base/batch_struct.hpp" #include "cuda/base/config.hpp" diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp index 429481ec9b6..d53af925df0 100644 --- a/cuda/base/curand_bindings.hpp +++ b/cuda/base/curand_bindings.hpp @@ -53,6 +53,17 @@ namespace cuda { * @ingroup curand */ namespace curand { +namespace detail { + + +template +inline int64 not_implemented(Args...) +{ + return static_cast(CURAND_STATUS_TYPE_ERROR); +} + + +} // namespace detail template @@ -101,6 +112,8 @@ GKO_BIND_CURAND_RANDOM_VECTOR(float, curandGenerateNormal); GKO_BIND_CURAND_RANDOM_VECTOR(double, curandGenerateNormalDouble); GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormal); GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormalDouble); +template +GKO_BIND_CURAND_RANDOM_VECTOR(ValueType, detail::not_implemented); #undef GKO_BIND_CURAND_RANDOM_VECTOR diff --git a/cuda/base/types.cpp b/cuda/base/types.cpp new file mode 100644 index 00000000000..130f8baca57 --- /dev/null +++ b/cuda/base/types.cpp @@ -0,0 +1,33 @@ +#include "cuda/base/types.hpp" + + +#if defined(__CUDACC__) + +#define BFLOAT_FRIEND_OPERATOR(_op, _opeq) \ + __device__ __forceinline__ __nv_bfloat16 operator _op( \ + const __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ + { \ + return static_cast<__nv_bfloat16>(static_cast(lhs) \ + _op static_cast(rhs)); \ + } \ + __device__ __forceinline__ __nv_bfloat16& operator _opeq( \ + __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ + { \ + lhs = static_cast(lhs) _op static_cast(rhs); \ + return lhs; \ + } +BFLOAT_FRIEND_OPERATOR(+, +=) +BFLOAT_FRIEND_OPERATOR(-, -=) +BFLOAT_FRIEND_OPERATOR(*, *=) +BFLOAT_FRIEND_OPERATOR(/, /=) + +__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) +{ + return h; +} +__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) +{ + return -float{h}; +} + +#endif \ No newline at end 
of file diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 20dbccbe785..c3e50d7e4a5 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -42,24 +42,238 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include +#include #include +// #if defined(__CUDACC__) + +// #define BFLOAT_FRIEND_OPERATOR(_op, _opeq) \ +// __forceinline__ __device__ __nv_bfloat16 operator _op( \ +// const __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ +// { \ +// return static_cast<__nv_bfloat16>(static_cast(lhs) \ +// _op static_cast(rhs)); \ +// } \ +// __forceinline__ __device__ __nv_bfloat16& operator _opeq( \ +// __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ +// { \ +// lhs = static_cast(lhs) _op static_cast(rhs); \ +// return lhs; \ +// } +// BFLOAT_FRIEND_OPERATOR(+, +=) +// BFLOAT_FRIEND_OPERATOR(-, -=) +// BFLOAT_FRIEND_OPERATOR(*, *=) +// BFLOAT_FRIEND_OPERATOR(/, /=) + +// __forceinline__ __device__ __nv_bfloat16 operator+(const __nv_bfloat16& h) +// { +// return h; +// } +// __forceinline__ __device__ __nv_bfloat16 operator-(const __nv_bfloat16& h) +// { +// return -float{h}; +// } +// #undef BFLOAT_FRIEND_OPERATOR + +// #endif + + +// thrust calls the C function not the function from std +// Maybe override the function from thrust directly +GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) +{ + return hypot(static_cast(a), static_cast(b)); +} + +GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( + thrust::complex<__half> a) +{ + return sqrt(static_cast>(a)); +} + +GKO_ATTRIBUTES GKO_INLINE __nv_bfloat16 hypot(__nv_bfloat16 a, __nv_bfloat16 b) +{ + return hypot(static_cast(a), static_cast(b)); +} + +GKO_ATTRIBUTES GKO_INLINE thrust::complex<__nv_bfloat16> sqrt( + thrust::complex<__nv_bfloat16> a) +{ + return sqrt(static_cast>(a)); +} + + +namespace thrust { + + +// Directly call float version from here? 
+template <> +GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) +{ + return abs(static_cast>(z)); +} + +template <> +GKO_ATTRIBUTES GKO_INLINE __nv_bfloat16 +abs<__nv_bfloat16>(const complex<__nv_bfloat16>& z) +{ + return abs(static_cast>(z)); +} + + +} // namespace thrust + + +#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ + const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ + { \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ + } \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__nv_bfloat16> operator _op( \ + const thrust::complex<__nv_bfloat16>& lhs, \ + const thrust::complex<__nv_bfloat16>& rhs) \ + { \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ + } + +THRUST_HALF_FRIEND_OPERATOR(+, +=) +THRUST_HALF_FRIEND_OPERATOR(-, -=) +THRUST_HALF_FRIEND_OPERATOR(*, *=) +THRUST_HALF_FRIEND_OPERATOR(/, /=) + + namespace gko { +// It is required by NVHPC 23.3, isnan is undefined when NVHPC are only as host +// compiler. 
+#ifdef __CUDACC__ + +// from the cuda_fp16.hpp +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + + +template <> +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return __hisnan(val); +} + +template <> +__device__ __forceinline__ bool is_nan(const __nv_bfloat16& val) +{ + return isnan(static_cast(val)); +} + + +#else + + +template <> +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return isnan(static_cast(val)); +} + +template <> +__device__ __forceinline__ bool is_nan(const __nv_bfloat16& val) +{ + return isnan(static_cast(val)); +} + + +#endif + + +template <> +__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) +{ + return is_nan(val.real()) || is_nan(val.imag()); +} + +template <> +__device__ __forceinline__ bool is_nan( + const thrust::complex<__nv_bfloat16>& val) +{ + return is_nan(val.real()) || is_nan(val.imag()); +} + + +#endif + namespace kernels { namespace cuda { -namespace detail { +#ifdef __CUDACC__ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + +#if CUDA_VERSION >= 10020 +__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } + +__device__ __forceinline__ __nv_bfloat16 abs(const __nv_bfloat16& val) +{ + return abs(static_cast(val)); +} +#else +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} + +__device__ __forceinline__ __nv_bfloat16 abs(const __nv_bfloat16& val) +{ + return abs(static_cast(val)); +} +#endif + + +__device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } + +__device__ __forceinline__ __nv_bfloat16 sqrt(const __nv_bfloat16& val) +{ + return sqrt(static_cast(val)); +} + + +#else +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} + +__device__ __forceinline__ __nv_bfloat16 abs(const __nv_bfloat16& val) +{ + return abs(static_cast(val)); +} + + +__device__ __forceinline__ __half sqrt(const __half& val) +{ + return 
sqrt(static_cast(val)); +} + +__device__ __forceinline__ __nv_bfloat16 sqrt(const __nv_bfloat16& val) +{ + return sqrt(static_cast(val)); +} + + +#endif +#endif + +namespace detail { + /** * @internal * @@ -156,6 +370,27 @@ struct culibs_type_impl> { using type = cuDoubleComplex; }; + +template <> +struct culibs_type_impl { + using type = __half; +}; + +template <> +struct culibs_type_impl { + using type = __nv_bfloat16; +}; + +template <> +struct culibs_type_impl> { + using type = __half2; +}; + +template <> +struct culibs_type_impl> { + using type = __nv_bfloat162; +}; + template struct culibs_type_impl> { using type = typename culibs_type_impl>::type; @@ -186,9 +421,19 @@ struct cuda_type_impl { using type = volatile typename cuda_type_impl::type; }; +template <> +struct cuda_type_impl { + using type = __half; +}; + +template <> +struct cuda_type_impl { + using type = __nv_bfloat16; +}; + template struct cuda_type_impl> { - using type = thrust::complex; + using type = thrust::complex::type>; }; template <> @@ -201,6 +446,16 @@ struct cuda_type_impl { using type = thrust::complex; }; +template <> +struct cuda_type_impl<__half2> { + using type = thrust::complex<__half>; +}; + +template <> +struct cuda_type_impl<__nv_bfloat162> { + using type = thrust::complex<__nv_bfloat16>; +}; + template struct cuda_struct_member_type_impl { using type = T; @@ -208,7 +463,17 @@ struct cuda_struct_member_type_impl { template struct cuda_struct_member_type_impl> { - using type = fake_complex; + using type = fake_complex::type>; +}; + +template <> +struct cuda_struct_member_type_impl { + using type = __half; +}; + +template <> +struct cuda_struct_member_type_impl { + using type = __nv_bfloat16; }; template @@ -228,10 +493,13 @@ struct cuda_data_type_impl {}; } GKO_CUDA_DATA_TYPE(float16, CUDA_R_16F); +GKO_CUDA_DATA_TYPE(bfloat16, CUDA_R_16BF); GKO_CUDA_DATA_TYPE(float, CUDA_R_32F); GKO_CUDA_DATA_TYPE(double, CUDA_R_64F); GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_32F); 
GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_64F); +GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_16F); +GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_16BF); GKO_CUDA_DATA_TYPE(int32, CUDA_R_32I); GKO_CUDA_DATA_TYPE(int8, CUDA_R_8I); diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index db59a47658d..0cd2e9688a1 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -332,7 +332,7 @@ public: SelectorType selector) const \ { \ return shuffle_impl( \ - [this](uint32 v, SelectorType s) { \ + [this](uint16 v, SelectorType s) { \ return static_cast(this)->_name(v, s); \ }, \ var, selector); \ @@ -352,12 +352,12 @@ private: shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, SelectorType selector) { - static_assert(sizeof(ValueType) % sizeof(uint32) == 0, - "Unable to shuffle sizes which are not 4-byte multiples"); - constexpr auto value_size = sizeof(ValueType) / sizeof(uint32); + static_assert(sizeof(ValueType) % sizeof(uint16) == 0, + "Unable to shuffle sizes which are not 2-byte multiples"); + constexpr auto value_size = sizeof(ValueType) / sizeof(uint16); ValueType result; - auto var_array = reinterpret_cast(&var); - auto result_array = reinterpret_cast(&result); + auto var_array = reinterpret_cast(&var); + auto result_array = reinterpret_cast(&result); #pragma unroll for (std::size_t i = 0; i < value_size; ++i) { result_array[i] = intrinsic_shuffle(var_array[i], selector); diff --git a/cuda/distributed/matrix_kernels.cu b/cuda/distributed/matrix_kernels.cu index b1f5558d69e..f14bee8bbd3 100644 --- a/cuda/distributed/matrix_kernels.cu +++ b/cuda/distributed/matrix_kernels.cu @@ -50,6 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/base/thrust.cuh" +#include "cuda/base/types.hpp" #include "cuda/components/atomic.cuh" diff --git a/cuda/factorization/cholesky_kernels.cu b/cuda/factorization/cholesky_kernels.cu index 30fd249530b..076ed9a2546 100644 --- a/cuda/factorization/cholesky_kernels.cu +++ b/cuda/factorization/cholesky_kernels.cu @@ -56,6 +56,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/thrust.cuh" +#include "cuda/base/types.hpp" #include "cuda/components/cooperative_groups.cuh" #include "cuda/components/intrinsics.cuh" #include "cuda/components/reduction.cuh" diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index 124a4deda75..7b20236827e 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -122,10 +122,12 @@ void abstract_spmv(syn::value_list, const matrix::Dense* alpha = nullptr, const matrix::Dense* beta = nullptr) { + using arithmetic_type = + highest_precision; using a_accessor = - gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); diff --git a/cuda/matrix/fft_kernels.cu b/cuda/matrix/fft_kernels.cu index 31a679df019..8d1e32335cf 100644 --- a/cuda/matrix/fft_kernels.cu +++ b/cuda/matrix/fft_kernels.cu @@ -151,7 +151,7 @@ void fft(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -167,7 +167,8 @@ void fft2(std::shared_ptr exec, handle.execute(b->get_const_values(), 
x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -183,7 +184,8 @@ void fft3(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/cuda/solver/cb_gmres_kernels.cu b/cuda/solver/cb_gmres_kernels.cu index 93e791c76e8..796daf39672 100644 --- a/cuda/solver/cb_gmres_kernels.cu +++ b/cuda/solver/cb_gmres_kernels.cu @@ -113,7 +113,8 @@ void initialize(std::shared_ptr exec, as_device_type(stop_status->get_data())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 6ee2c7521ff..b8595eee9b5 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -240,14 +240,15 @@ struct CudaSolveStruct : gko::solver::SolveStruct { policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; size_type work_size{}; - + // TODO: a plain nullptr is treated as nullptr_t and is not cast to the const + // pointer type; this does not work in the cuda110/100 images cusparse::buffer_size_ext( handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - &work_size); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, &work_size); // allocate workspace work.resize_and_reset(work_size); @@ -257,8 +258,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { 
CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - work.get_data()); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, work.get_data()); } void solve(const matrix::Csr* matrix, @@ -484,7 +485,8 @@ __global__ void sptrsv_naive_legacy_kernel( const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1]; const int row_step = is_upper ? -1 : 1; - ValueType sum = 0.0; + // no constructor from double to thrust<__half> + ValueType sum = zero(); auto j = row_begin; auto col = colidxs[j]; while (j != row_end) { @@ -538,7 +540,7 @@ void sptrsv_naive_caching(std::shared_ptr exec, const auto nrhs = b->get_size()[1]; // Initialize x to all NaNs. - dense::fill(exec, x, nan()); + dense::fill(exec, x, ValueType(nan())); array nan_produced(exec, 1); array atomic_counter(exec, 1); diff --git a/dev_tools/scripts/gdb-ginkgo.py b/dev_tools/scripts/gdb-ginkgo.py index c028e72994e..2c52af6452f 100644 --- a/dev_tools/scripts/gdb-ginkgo.py +++ b/dev_tools/scripts/gdb-ginkgo.py @@ -51,6 +51,7 @@ def next(self): _versioned_namespace = '__8::' + # new version adapted from https://gcc.gnu.org/pipermail/gcc-cvs/2021-November/356230.html # necessary due to empty class optimization def is_specialization_of(x, template_name): @@ -64,6 +65,7 @@ def is_specialization_of(x, template_name): expr = '^std::{}<.*>$'.format(template_name) return re.match(expr, x) is not None + def get_template_arg_list(type_obj): "Return a type's template arguments as a list" n = 0 @@ -75,6 +77,7 @@ def get_template_arg_list(type_obj): return template_args n += 1 + def _tuple_impl_get(val): "Return the tuple element stored in a _Tuple_impl base class." 
bases = val.type.fields() @@ -95,6 +98,7 @@ def _tuple_impl_get(val): else: raise ValueError("Unsupported implementation for std::tuple: %s" % str(val.type)) + def tuple_get(n, val): "Return the result of std::get(val) on a std::tuple" tuple_size = len(get_template_arg_list(val.type)) @@ -108,6 +112,7 @@ def tuple_get(n, val): n -= 1 return _tuple_impl_get(node) + def get_unique_ptr_data_ptr(val): "Return the result of val.get() on a std::unique_ptr" # std::unique_ptr contains a std::tuple, @@ -219,13 +224,37 @@ def display_hint(self): return 'array' -def lookup_type(val): - if not str(val.type.unqualified()).startswith('gko::'): +class GkoHalfPrinter: + "Print a gko::half" + + def __init__(self, val): + # GDB doesn't seem to consider the user-defined conversion in its Value.cast, + # so we need to call the conversion operator explicitly + address = hex(val.address) + self.float_val = gdb.parse_and_eval(f"reinterpret_cast({address})->operator float()") + + def to_string(self): + self.float_val.fetch_lazy() + return self.float_val + + +def create_printer(val, type_suffix, type_printer): + val_type = gdb.types.get_basic_type(val.type) + if not str(val_type).startswith('gko::'): return None - suffix = str(val.type.unqualified())[5:] - if suffix.startswith('array'): - return GkoArrayPrinter(val) + suffix = str(val_type)[5:] + if suffix.startswith(type_suffix): + return type_printer(val) return None -gdb.pretty_printers.append(lookup_type) +def gko_array(val): + return create_printer(val, 'array', GkoArrayPrinter) + + +def gko_half(val): + return create_printer(val, 'half', GkoHalfPrinter) + + +gdb.pretty_printers.append(gko_array) +gdb.pretty_printers.append(gko_half) diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp index d936f78aa94..44b55e3e6bb 100644 --- a/dpcpp/components/atomic.dp.hpp +++ b/dpcpp/components/atomic.dp.hpp @@ -175,6 +175,21 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD 
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); +// sycl does not support 16bit +template +struct atomic_helper> { + __dpct_inline__ static ValueType atomic_add(ValueType* __restrict__ addr, + ValueType val) + { + // GKO_NOT_IMPLEMENTED; + // wrong implementation because sycl can not use exception in kernel + auto old = *addr; + *addr += val; + return old; + } +}; + #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE @@ -242,7 +257,20 @@ struct atomic_helper< GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned int); - +// not support 16bit +template +struct atomic_max_helper> { + __dpct_inline__ static ValueType atomic_max(ValueType* __restrict__ addr, + ValueType val) + { + // GKO_NOT_IMPLEMENTED; + // wrong implementation because sycl can not use exception in kernel + auto old = *addr; + *addr = std::max(*addr, val); + return old; + } +}; #undef GKO_BIND_ATOMIC_MAX_STRUCTURE diff --git a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc index 41fa99cc24e..10da0115223 100644 --- a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc +++ b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc @@ -372,7 +372,7 @@ void basecase_select(const ValueType* __restrict__ input, IndexType size, for (int i = 0; i < basecase_local_size; ++i) { auto idx = item_ct1.get_local_id(2) + i * basecase_block_size; - local[i] = idx < size ? input[idx] : sentinel; + local[i] = idx < size ? input[idx] : static_cast(sentinel); } bitonic_sort(local, sh_local, item_ct1); if (item_ct1.get_local_id(2) == rank / basecase_local_size) { diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 11309b67b9b..2ae9022edf9 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -62,6 +62,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" +#include "dpcpp/base/onemkl_bindings.hpp" #include "dpcpp/components/atomic.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" @@ -297,7 +298,7 @@ void abstract_spmv( { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; - const arithmetic_type scale_factor = alpha[0]; + const arithmetic_type scale_factor = static_cast(alpha[0]); spmv_kernel( nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c, [&scale_factor](const arithmetic_type& x) { @@ -512,8 +513,8 @@ void abstract_merge_path_spmv( sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs) { using type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); merge_path_spmv( num_rows, val, col_idxs, row_ptrs, srow, b, c, row_out, val_out, [&alpha_val](const type& x) { return alpha_val * x; }, @@ -604,7 +605,7 @@ void abstract_reduce( uninitialized_array& tmp_ind, uninitialized_array& tmp_val) { - const arithmetic_type alpha_val = alpha[0]; + const arithmetic_type alpha_val = static_cast(alpha[0]); merge_path_reduce( nwarps, last_val, last_row, c, [&alpha_val](const arithmetic_type& x) { return alpha_val * x; }, @@ -704,13 +705,13 @@ void abstract_classical_spmv( { if (subgroup_size > 1) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(subgroup_size)]] { - abstract_classical_spmv( - num_rows, val, col_idxs, row_ptrs, b, - c, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, 
c, item_ct1); + }); }); } else { queue->submit([&](sycl::handler& cgh) { @@ -735,8 +736,8 @@ void abstract_classical_spmv( acc::range c, sycl::nd_item<3> item_ct1) { using type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); device_classical_spmv( num_rows, val, col_idxs, row_ptrs, b, c, [&alpha_val, &beta_val](const type& x, const type& y) { @@ -758,13 +759,14 @@ void abstract_classical_spmv( { if (subgroup_size > 1) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(subgroup_size)]] { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, - row_ptrs, b, beta, c, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, beta, c, + item_ct1); + }); }); } else { queue->submit([&](sycl::handler& cgh) { @@ -1245,8 +1247,11 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType host_beta, matrix::Dense* c) { - bool try_sparselib = !is_complex(); - if (try_sparselib) { + constexpr bool try_sparselib = + !is_complex() && + !std::is_same::value && + !std::is_same::value; + if constexpr (try_sparselib) { oneapi::mkl::sparse::matrix_handle_t mat_handle; oneapi::mkl::sparse::init_matrix_handle(&mat_handle); oneapi::mkl::sparse::set_csr_data( diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index fba518f387b..a73bb226f3b 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -250,17 +250,21 @@ void simple_apply(std::shared_ptr exec, matrix::Dense* c) { using namespace oneapi::mkl; - if (b->get_stride() != 0 && c->get_stride() != 0) { - if (a->get_size()[1] > 0) { - 
oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - one(), a->get_const_values(), a->get_stride(), - b->get_const_values(), b->get_stride(), zero(), - c->get_values(), c->get_stride()); - } else { - dense::fill(exec, c, zero()); + if constexpr (onemkl::is_supported::value) { + if (b->get_stride() != 0 && c->get_stride() != 0) { + if (a->get_size()[1] > 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, + transpose::nontrans, c->get_size()[0], c->get_size()[1], + a->get_size()[1], one(), a->get_const_values(), + a->get_stride(), b->get_const_values(), b->get_stride(), + zero(), c->get_values(), c->get_stride()); + } else { + dense::fill(exec, c, zero()); + } } + } else { + GKO_NOT_IMPLEMENTED; } } @@ -274,19 +278,24 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { using namespace oneapi::mkl; - if (b->get_stride() != 0 && c->get_stride() != 0) { - if (a->get_size()[1] > 0) { - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - exec->copy_val_to_host(alpha->get_const_values()), - a->get_const_values(), a->get_stride(), b->get_const_values(), - b->get_stride(), - exec->copy_val_to_host(beta->get_const_values()), - c->get_values(), c->get_stride()); - } else { - dense::scale(exec, beta, c); + if constexpr (onemkl::is_supported::value) { + if (b->get_stride() != 0 && c->get_stride() != 0) { + if (a->get_size()[1] > 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, + transpose::nontrans, c->get_size()[0], c->get_size()[1], + a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), + b->get_const_values(), b->get_stride(), + exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), 
c->get_stride()); + } else { + dense::scale(exec, beta, c); + } } + } else { + GKO_NOT_IMPLEMENTED; } } diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 65fad771140..4817b9a5991 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -120,16 +120,17 @@ void spmv_kernel( const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, Closure op, sycl::nd_item<3> item_ct1, - uninitialized_array& storage) { + using arithmetic_type = typename a_accessor::arithmetic_type; const auto tidx = thread::get_thread_id_flat(item_ct1); const decltype(tidx) column_id = item_ct1.get_group(1); if (num_thread_per_worker == 1) { // Specialize the num_thread_per_worker = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { - auto temp = zero(); + auto temp = zero(); for (size_type idx = 0; idx < num_stored_elements_per_row; idx++) { const auto ind = tidx + idx * stride; const auto col_idx = col[ind]; @@ -150,11 +151,11 @@ void spmv_kernel( const auto step_size = num_worker_per_row * num_thread_per_worker; if (runnable && idx_in_worker == 0) { - storage[item_ct1.get_local_id(2)] = 0; + storage[item_ct1.get_local_id(2)] = zero(); } item_ct1.barrier(sycl::access::fence_space::local_space); - auto temp = zero(); + auto temp = zero(); if (runnable) { for (size_type idx = worker_id * num_thread_per_worker + idx_in_worker; @@ -193,13 +194,15 @@ void spmv( const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, sycl::nd_item<3> item_ct1, - uninitialized_array& storage) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [](const OutputValueType& x, const OutputValueType& y) { return x; }, + [](const auto& x, const OutputValueType& y) { + return static_cast(x); + }, 
item_ct1, storage); } @@ -214,7 +217,7 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> storage_acc_ct1(cgh); @@ -239,10 +242,11 @@ void spmv( const size_type num_stored_elements_per_row, acc::range b, const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, const size_type c_stride, sycl::nd_item<3> item_ct1, - uninitialized_array& storage) { - const OutputValueType alpha_val = alpha(0); + using arithmetic_type = typename a_accessor::arithmetic_type; + const auto alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; if (atomic) { // Because the atomic operation changes the values of c during @@ -253,17 +257,17 @@ void spmv( spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val](const OutputValueType& x, const OutputValueType& y) { - return alpha_val * x; + [&alpha_val](const auto& x, const OutputValueType& y) { + return static_cast(alpha_val * x); }, item_ct1, storage); } else { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, - [&alpha_val, &beta_val](const OutputValueType& x, - const OutputValueType& y) { - return alpha_val * x + beta_val * y; + [&alpha_val, &beta_val](const auto& x, const OutputValueType& y) { + return static_cast( + alpha_val * x + static_cast(beta_val * y)); }, item_ct1, storage); } @@ -281,7 +285,7 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { sycl::accessor< - uninitialized_array, 0, sycl::access_mode::read_write, sycl::access::target::local> storage_acc_ct1(cgh); @@ -316,10 +320,12 @@ void abstract_spmv(syn::value_list, const matrix::Dense* alpha = nullptr, const matrix::Dense* beta = nullptr) { + using arithmetic_type = + highest_precision; using a_accessor = - 
gko::acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + gko::acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - gko::acc::reduced_row_major<2, OutputValueType, const InputValueType>; + gko::acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 9630b8dcb91..fa93b55a903 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -980,7 +980,8 @@ void initialize(std::shared_ptr exec, stop_status->get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 3e7b5737f0f..a9f5aa5b3c1 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -636,7 +636,9 @@ void initialize_subspace_vectors(std::shared_ptr exec, cgh.parallel_for(sycl::range<1>(n), [=](sycl::item<1> idx) { std::uint64_t offset = idx.get_linear_id(); oneapi::dpl::minstd_rand engine(seed, offset); - oneapi::dpl::normal_distribution> + oneapi::dpl::normal_distribution< + typename ::gko::detail::arth_type< + remove_complex>::type> distr(0, 1); auto res = distr(engine); diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index aae15245357..f869a1b05ed 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -434,7 +434,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) auto dense_data = gko::test::generate_random_matrix_data( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine); + std::normal_distribution(1.0, 2.0), engine); 
gko::utils::make_diag_dominant(dense_data); auto dense_smtx = gko::share(Vec::create(ref)); dense_smtx->read(dense_data); diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 14e144f6d84..4fb0703443b 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -58,6 +58,17 @@ namespace hip { * @ingroup hiprand */ namespace hiprand { +namespace detail { + + +template +inline int64 not_implemented(Args...) +{ + return static_cast(HIPRAND_STATUS_TYPE_ERROR); +} + + +} // namespace detail template @@ -107,6 +118,8 @@ GKO_BIND_HIPRAND_RANDOM_VECTOR(double, hiprandGenerateNormalDouble); GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, hiprandGenerateNormal); GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, hiprandGenerateNormalDouble); +template +GKO_BIND_HIPRAND_RANDOM_VECTOR(ValueType, detail::not_implemented); #undef GKO_BIND_HIPRAND_RANDOM_VECTOR diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index c886378ec80..5f9943a4c5a 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -51,11 +52,154 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include +// thrust calls the C function not the function from std +// Maybe override the function from thrust directly +__device__ __forceinline__ __half hypot(__half a, __half b) +{ + return hypot(static_cast(a), static_cast(b)); +} + +__device__ __forceinline__ thrust::complex<__half> sqrt( + thrust::complex<__half> a) +{ + return sqrt(static_cast>(a)); +} + +__device__ __forceinline__ hip_bfloat16 hypot(hip_bfloat16 a, hip_bfloat16 b) +{ + return static_cast( + hypot(static_cast(a), static_cast(b))); +} + +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex a) +{ + return sqrt(static_cast>(a)); +} + +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) +{ + return thrust::sqrt(val); +} +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) +{ + return thrust::sqrt(val); +} + +#if GINKGO_HIP_PLATFORM_NVCC && defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 +__device__ __forceinline__ __half sqrt(__half val) +{ + return sqrt(static_cast(val)); +} + +__device__ __forceinline__ hip_bfloat16 sqrt(hip_bfloat16 val) +{ + return sqrt(static_cast(val)); +} +#else +__device__ __forceinline__ __half sqrt(__half val) { return hsqrt(val); } +__device__ __forceinline__ hip_bfloat16 sqrt(hip_bfloat16 val) +{ + return static_cast(sqrt(static_cast(val))); +} +#endif + + +namespace thrust { + + +// Directly call float version from here? 
+template <> +GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) +{ + return hypot(static_cast(z.real()), static_cast(z.imag())); +} + +template <> +GKO_ATTRIBUTES GKO_INLINE hip_bfloat16 +abs(const complex& z) +{ + return static_cast( + hypot(static_cast(z.real()), static_cast(z.imag()))); +} + + +} // namespace thrust + +#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ + const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ + { \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ + } \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex operator _op( \ + const thrust::complex lhs, \ + const thrust::complex rhs) \ + { \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ + } + +THRUST_HALF_FRIEND_OPERATOR(+, +=) +THRUST_HALF_FRIEND_OPERATOR(-, -=) +THRUST_HALF_FRIEND_OPERATOR(*, *=) +THRUST_HALF_FRIEND_OPERATOR(/, /=) + + namespace gko { +#if GINKGO_HIP_PLATFORM_NVCC +// from the cuda_fp16.hpp +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return __hisnan(val); +} +#if CUDA_VERSION >= 10020 +__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } +#else +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} +#endif +#else +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return is_nan(static_cast(val)); +} + +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} +#endif + +#else // Not nvidia device +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return __hisnan(val); +} + +// rocm40 __habs is not constexpr +__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } + +#endif + +__device__ __forceinline__ bool is_nan(const hip_bfloat16& val) +{ + return is_nan(static_cast(val)); +} + +__device__ __forceinline__ hip_bfloat16 
abs(const hip_bfloat16& val) +{ + return static_cast(abs(static_cast(val))); +} namespace kernels { namespace hip { @@ -158,6 +302,27 @@ struct hiplibs_type_impl> { using type = hipDoubleComplex; }; +template <> +struct hiplibs_type_impl { + using type = __half; +}; + +template <> +struct hiplibs_type_impl> { + using type = __half2; +}; + +template <> +struct hiplibs_type_impl { + using type = hip_bfloat16; +}; + +template <> +struct hiplibs_type_impl> { + // TODO: HIP does not support it. + using type = __half2; +}; + template struct hiplibs_type_impl> { using type = typename hiplibs_type_impl>::type; @@ -230,9 +395,19 @@ struct hip_type_impl { using type = volatile typename hip_type_impl::type; }; +template <> +struct hip_type_impl { + using type = __half; +}; + +template <> +struct hip_type_impl { + using type = hip_bfloat16; +}; + template struct hip_type_impl> { - using type = thrust::complex; + using type = thrust::complex::type>; }; template <> @@ -245,6 +420,13 @@ struct hip_type_impl { using type = thrust::complex; }; +template <> +struct hip_type_impl<__half2> { + using type = thrust::complex<__half>; +}; + +// TODO: hip does not support hip_bfloat162 + template struct hip_struct_member_type_impl { using type = T; @@ -252,7 +434,17 @@ struct hip_struct_member_type_impl { template struct hip_struct_member_type_impl> { - using type = fake_complex; + using type = fake_complex::type>; +}; + +template <> +struct hip_struct_member_type_impl { + using type = __half; +}; + +template <> +struct hip_struct_member_type_impl { + using type = hip_bfloat16; }; template diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 647a6f9bc22..f7bc45c087b 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -335,7 +335,7 @@ class enable_extended_shuffle : public Group { SelectorType selector) const \ { \ return shuffle_impl( \ - [this](uint32 v, SelectorType s) { \ + 
[this](uint16 v, SelectorType s) { \ return static_cast(this)->_name(v, s); \ }, \ var, selector); \ @@ -355,12 +355,12 @@ class enable_extended_shuffle : public Group { shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, SelectorType selector) { - static_assert(sizeof(ValueType) % sizeof(uint32) == 0, + static_assert(sizeof(ValueType) % sizeof(uint16) == 0, "Unable to shuffle sizes which are not 4-byte multiples"); - constexpr auto value_size = sizeof(ValueType) / sizeof(uint32); + constexpr auto value_size = sizeof(ValueType) / sizeof(uint16); ValueType result; - auto var_array = reinterpret_cast(&var); - auto result_array = reinterpret_cast(&result); + auto var_array = reinterpret_cast(&var); + auto result_array = reinterpret_cast(&result); #pragma unroll for (std::size_t i = 0; i < value_size; ++i) { result_array[i] = intrinsic_shuffle(var_array[i], selector); diff --git a/hip/matrix/csr_kernels.instantiate.hip.cpp b/hip/matrix/csr_kernels.instantiate.hip.cpp index 9a6c29206de..a0cb622eeca 100644 --- a/hip/matrix/csr_kernels.instantiate.hip.cpp +++ b/hip/matrix/csr_kernels.instantiate.hip.cpp @@ -62,6 +62,18 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL, int64); // split @@ -73,6 +85,18 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, // split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, 
int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); // split @@ -88,6 +112,18 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split @@ -99,6 +135,18 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( // split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT7( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT8( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index db9d5aa11bb..1567548463f 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -125,10 +125,12 @@ void abstract_spmv(syn::value_list, 
const matrix::Dense* alpha = nullptr, const matrix::Dense* beta = nullptr) { + using arithmetic_type = + highest_precision; using a_accessor = - acc::reduced_row_major<1, OutputValueType, const MatrixValueType>; + acc::reduced_row_major<1, arithmetic_type, const MatrixValueType>; using b_accessor = - acc::reduced_row_major<2, OutputValueType, const InputValueType>; + acc::reduced_row_major<2, arithmetic_type, const InputValueType>; const auto nrows = a->get_size()[0]; const auto stride = a->get_stride(); diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp index 56c967d9e49..e793663c6b8 100644 --- a/hip/matrix/fft_kernels.hip.cpp +++ b/hip/matrix/fft_kernels.hip.cpp @@ -191,7 +191,7 @@ void fft(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -207,7 +207,8 @@ void fft2(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -223,7 +224,8 @@ void fft3(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/hip/solver/cb_gmres_kernels.hip.cpp b/hip/solver/cb_gmres_kernels.hip.cpp index d47b53f2dfa..7664c456396 100644 --- a/hip/solver/cb_gmres_kernels.hip.cpp +++ b/hip/solver/cb_gmres_kernels.hip.cpp @@ -115,7 +115,8 @@ void initialize(std::shared_ptr exec, as_device_type(stop_status->get_data())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/hip/test/matrix/fbcsr_kernels.cpp b/hip/test/matrix/fbcsr_kernels.cpp index c10544394e3..a8611beddea 100644 --- a/hip/test/matrix/fbcsr_kernels.cpp +++ b/hip/test/matrix/fbcsr_kernels.cpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -73,7 +74,7 @@ class Fbcsr : public HipTestFixture { std::unique_ptr rsorted_ref; - std::normal_distribution> distb; + std::normal_distribution<> distb; std::default_random_engine engine; value_type get_random_value() @@ -177,11 +178,16 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - rand_hip->apply(x_hip, prod_hip); - this->rsorted_ref->apply(x_ref, prod_ref); + if (std::is_same::value || + std::is_same::value) { + ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); + } else { + rand_hip->apply(x_hip, prod_hip); + this->rsorted_ref->apply(x_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -201,11 +207,16 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - rand_hip->apply(x_hip, prod_hip); - this->rsorted_ref->apply(x_ref, prod_ref); + if (std::is_same::value || + std::is_same::value) { + ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); + } else { + rand_hip->apply(x_hip, prod_hip); + this->rsorted_ref->apply(x_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -237,11 +248,17 @@ 
TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - rand_hip->apply(alpha, x_hip, beta, prod_hip); - this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); + if (std::is_same::value || + std::is_same::value) { + ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), + gko::NotImplemented); + } else { + rand_hip->apply(alpha, x_hip, beta, prod_hip); + this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -273,11 +290,17 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - rand_hip->apply(alpha, x_hip, beta, prod_hip); - this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); + if (std::is_same::value || + std::is_same::value) { + ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), + gko::NotImplemented); + } else { + rand_hip->apply(alpha, x_hip, beta, prod_hip); + this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 720b8c8a45d..ca2aa30e6f0 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -133,6 +133,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_HAVE_HWLOC @GINKGO_HAVE_HWLOC@ // clang-format on +/* Is half operation available ? */ +// clang-format off +#cmakedefine01 GINKGO_ENABLE_HALF +// clang-format on + /* Do we need to use blocking communication in our SpMV? 
*/ // clang-format off diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index 1140f1e400c..b52f13d08d8 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index d91274526d3..f44eb81ee9b 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -81,11 +81,16 @@ class MultiVector : public EnablePolymorphicObject>, public EnablePolymorphicAssignment>, public EnableCreateMethod>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + MultiVector>>>, + public ConvertibleTo>>, +#endif public ConvertibleTo>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class MultiVector>; - friend class MultiVector>; + friend class MultiVector>; public: using EnablePolymorphicAssignment::convert_to; @@ -113,6 +118,29 @@ class MultiVector void move_to(MultiVector>* result) override; +#if GINKGO_ENABLE_HALF + friend class MultiVector>>; + using ConvertibleTo< + MultiVector>>>::convert_to; + using ConvertibleTo< + MultiVector>>>::move_to; + + void convert_to(MultiVector>>* + result) const override; + + void move_to(MultiVector>>* result) + override; + + friend class MultiVector>; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; + + void convert_to( + MultiVector>* result) const override; + + void move_to(MultiVector>* result) override; +#endif + /** * Creates a mutable view (of matrix::Dense type) of one item of the Batch * MultiVector object. 
Does not perform any deep copies, but only returns a @@ -430,7 +458,7 @@ class MultiVector private: batch_dim<2> batch_size_; array values_; -}; +}; // class MultiVector /** diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp index c70c5f054ec..b42bad41d54 100644 --- a/include/ginkgo/core/base/dim.hpp +++ b/include/ginkgo/core/base/dim.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp index 8b270ed7a98..e3885135b11 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index c7195501178..e18984fb866 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -48,6 +48,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp new file mode 100644 index 00000000000..f6d9fcc320a --- /dev/null +++ b/include/ginkgo/core/base/half.hpp @@ -0,0 +1,1182 @@ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_HALF_HPP_ +#define GKO_PUBLIC_CORE_BASE_HALF_HPP_ + + +#include +#include + + +#include +#include + + +#ifdef __CUDA_ARCH__ + +#include +#include + +#if defined(__CUDACC__) && (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)) +#define BFLOAT_FRIEND_OPERATOR(_op, _opeq) \ + __forceinline__ __device__ __nv_bfloat16 operator _op( \ + const __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ + { \ + return static_cast<__nv_bfloat16>(static_cast(lhs) \ + _op static_cast(rhs)); \ + } \ + __forceinline__ __device__ __nv_bfloat16& operator _opeq( \ + __nv_bfloat16& lhs, const __nv_bfloat16& rhs) \ + { \ + lhs = static_cast(lhs) _op static_cast(rhs); \ + return lhs; \ + } +BFLOAT_FRIEND_OPERATOR(+, +=) +BFLOAT_FRIEND_OPERATOR(-, -=) +BFLOAT_FRIEND_OPERATOR(*, *=) +BFLOAT_FRIEND_OPERATOR(/, /=) + +__forceinline__ __device__ __nv_bfloat16 operator+(const __nv_bfloat16& h) +{ + return h; +} +__forceinline__ __device__ __nv_bfloat16 operator-(const __nv_bfloat16& h) +{ + return -float{h}; +} +#undef BFLOAT_FRIEND_OPERATOR + +#endif + +class hip_bfloat16; + + +#elif defined(__HIP_DEVICE_COMPILE__) + + +#include +#include +class __nv_bfloat16; + +#else + + +class __half; +class __nv_bfloat16; +class hip_bfloat16; + + +#endif // __CUDA_ARCH__ + + +namespace gko { + + +template +class truncated; + +class bfloat16; + + +namespace detail { + + +template +struct uint_of_impl {}; + +template +struct uint_of_impl> { + using type = uint16; +}; + +template +struct uint_of_impl> { + using type = uint32; +}; + +template +struct uint_of_impl> { + using type = uint64; +}; + +template +using uint_of = typename uint_of_impl::type; + + +template +struct basic_float_traits {}; + +template <> +struct basic_float_traits { + using type = float16; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool 
rounds_to_nearest = true; +}; + +template <> +struct basic_float_traits { + using type = bfloat16; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 7; + static constexpr int exponent_bits = 8; + static constexpr bool rounds_to_nearest = true; +}; + +// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +template <> +struct basic_float_traits<__half> { + using type = __half; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool rounds_to_nearest = true; +}; + +template <> +struct basic_float_traits<__nv_bfloat16> { + using type = __nv_bfloat16; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 7; + static constexpr int exponent_bits = 8; + static constexpr bool rounds_to_nearest = true; +}; + +template <> +struct basic_float_traits { + using type = hip_bfloat16; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 7; + static constexpr int exponent_bits = 8; + static constexpr bool rounds_to_nearest = true; +}; +// #endif + +template <> +struct basic_float_traits { + using type = float32; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 23; + static constexpr int exponent_bits = 8; + static constexpr bool rounds_to_nearest = true; +}; + +template <> +struct basic_float_traits { + using type = float64; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 52; + static constexpr int exponent_bits = 11; + static constexpr bool rounds_to_nearest = true; +}; + +template +struct basic_float_traits> { + using type = truncated; + static constexpr int sign_bits = ComponentId == 0 ? 1 : 0; + static constexpr int exponent_bits = + ComponentId == 0 ? basic_float_traits::exponent_bits : 0; + static constexpr int significand_bits = + ComponentId == 0 ? 
sizeof(type) * byte_size - exponent_bits - 1 + : sizeof(type) * byte_size; + static constexpr bool rounds_to_nearest = false; +}; + + +template +constexpr UintType create_ones(int n) +{ + return (n == sizeof(UintType) * byte_size ? static_cast(0) + : static_cast(1) << n) - + static_cast(1); +} + +template +struct float_traits { + using type = typename basic_float_traits::type; + using bits_type = uint_of; + static constexpr int sign_bits = basic_float_traits::sign_bits; + static constexpr int significand_bits = + basic_float_traits::significand_bits; + static constexpr int exponent_bits = basic_float_traits::exponent_bits; + static constexpr bits_type significand_mask = + create_ones(significand_bits); + static constexpr bits_type exponent_mask = + create_ones(significand_bits + exponent_bits) - + significand_mask; + static constexpr bits_type bias_mask = + create_ones(significand_bits + exponent_bits - 1) - + significand_mask; + static constexpr bits_type sign_mask = + create_ones(sign_bits + significand_bits + exponent_bits) - + exponent_mask - significand_mask; + static constexpr bool rounds_to_nearest = + basic_float_traits::rounds_to_nearest; + + static constexpr auto eps = + 1.0 / (1ll << (significand_bits + rounds_to_nearest)); + + static constexpr bool is_inf(bits_type data) + { + return (data & exponent_mask) == exponent_mask && + (data & significand_mask) == bits_type{}; + } + + static constexpr bool is_nan(bits_type data) + { + return (data & exponent_mask) == exponent_mask && + (data & significand_mask) != bits_type{}; + } + + static constexpr bool is_denom(bits_type data) + { + return (data & exponent_mask) == bits_type{}; + } +}; + + +template +struct precision_converter; + +// upcasting implementation details +template +struct precision_converter { + using source_traits = float_traits; + using result_traits = float_traits; + using source_bits = typename source_traits::bits_type; + using result_bits = typename result_traits::bits_type; + + 
static_assert(source_traits::exponent_bits <= + result_traits::exponent_bits && + source_traits::significand_bits <= + result_traits::significand_bits, + "SourceType has to have both lower range and precision or " + "higher range and precision than ResultType"); + + static constexpr int significand_offset = + result_traits::significand_bits - source_traits::significand_bits; + static constexpr int exponent_offset = significand_offset; + static constexpr int sign_offset = result_traits::exponent_bits - + source_traits::exponent_bits + + exponent_offset; + static constexpr result_bits bias_change = + result_traits::bias_mask - + (static_cast(source_traits::bias_mask) << exponent_offset); + + static constexpr result_bits shift_significand(source_bits data) noexcept + { + return static_cast(data & source_traits::significand_mask) + << significand_offset; + } + + static constexpr result_bits shift_exponent(source_bits data) noexcept + { + return update_bias( + static_cast(data & source_traits::exponent_mask) + << exponent_offset); + } + + static constexpr result_bits shift_sign(source_bits data) noexcept + { + return static_cast(data & source_traits::sign_mask) + << sign_offset; + } + +private: + static constexpr result_bits update_bias(result_bits data) noexcept + { + return data == typename result_traits::bits_type{} ? 
data + : data + bias_change; + } +}; + +// downcasting implementation details +template +struct precision_converter { + using source_traits = float_traits; + using result_traits = float_traits; + using source_bits = typename source_traits::bits_type; + using result_bits = typename result_traits::bits_type; + + static_assert(source_traits::exponent_bits >= + result_traits::exponent_bits && + source_traits::significand_bits >= + result_traits::significand_bits, + "SourceType has to have both lower range and precision or " + "higher range and precision than ResultType"); + + static constexpr int significand_offset = + source_traits::significand_bits - result_traits::significand_bits; + static constexpr int exponent_offset = significand_offset; + static constexpr int sign_offset = source_traits::exponent_bits - + result_traits::exponent_bits + + exponent_offset; + static constexpr source_bits bias_change = + (source_traits::bias_mask >> exponent_offset) - + static_cast(result_traits::bias_mask); + + static constexpr result_bits shift_significand(source_bits data) noexcept + { + return static_cast( + (data & source_traits::significand_mask) >> significand_offset); + } + + static constexpr result_bits shift_exponent(source_bits data) noexcept + { + return static_cast(update_bias( + (data & source_traits::exponent_mask) >> exponent_offset)); + } + + static constexpr result_bits shift_sign(source_bits data) noexcept + { + return static_cast((data & source_traits::sign_mask) >> + sign_offset); + } + +private: + static constexpr source_bits update_bias(source_bits data) noexcept + { + return data <= bias_change ? typename source_traits::bits_type{} + : limit_exponent(data - bias_change); + } + + static constexpr source_bits limit_exponent(source_bits data) noexcept + { + return data >= static_cast(result_traits::exponent_mask) + ? 
static_cast(result_traits::exponent_mask) + : data; + } +}; + + +} // namespace detail + +/** + * A class providing basic support for half precision floating point types. + * + * For now the only features are reduced storage compared to single precision + * and conversions from and to single precision floating point type. + */ +class half { +public: + // TODO: NVHPC (host side) may not use zero initialzation for the data + // member by default constructor in some cases. Not sure whether it is + // caused by something else in jacobi or isai. + GKO_ATTRIBUTES half() noexcept : data_(0){}; + + template ::value>> + GKO_ATTRIBUTES half(const T val) + { + this->float2half(static_cast(val)); + } + + GKO_ATTRIBUTES half(const half& val) = default; + + inline GKO_ATTRIBUTES half(const bfloat16& val); + + template + GKO_ATTRIBUTES half& operator=(const V val) + { + this->float2half(static_cast(val)); + return *this; + } + + GKO_ATTRIBUTES operator float() const noexcept + { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(reinterpret_cast(data_)); +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const auto bits = half2float(data_); + return reinterpret_cast(bits); +#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + } + + // can not use half operator _op(const half) for half + half + // operation will cast it to float and then do float operation such that it + // becomes float in the end. 
+#define HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend half operator _op(const half lhf, const half rhf) \ + { \ + return static_cast(static_cast(lhf) \ + _op static_cast(rhf)); \ + } \ + GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ + { \ + auto result = *this _op hf; \ + this->float2half(result); \ + return *this; \ + } + HALF_OPERATOR(+, +=) + HALF_OPERATOR(-, -=) + HALF_OPERATOR(*, *=) + HALF_OPERATOR(/, /=) + + // Do operation with different type + // If it is floating point, using floating point as type. + // If it is integer, using half as type +#define HALF_FRIEND_OPERATOR(_op, _opeq) \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const half hf, const T val) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(hf); \ + result _opeq static_cast(val); \ + return result; \ + } \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const T val, const half hf) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(val); \ + result _opeq static_cast(hf); \ + return result; \ + } + + HALF_FRIEND_OPERATOR(+, +=) + HALF_FRIEND_OPERATOR(-, -=) + HALF_FRIEND_OPERATOR(*, *=) + HALF_FRIEND_OPERATOR(/, /=) + + // the negative + GKO_ATTRIBUTES half operator-() const + { + auto val = 0.0f - *this; + return half(val); + } + +private: + using f16_traits = detail::float_traits; + using f32_traits = detail::float_traits; + + // TODO: do we really need this one? + // Without it, everything can be constexpr, which might make stuff easier. 
+ GKO_ATTRIBUTES void float2half(float val) noexcept + { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const auto tmp = __float2half_rn(val); + data_ = reinterpret_cast(tmp); +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + data_ = float2half(reinterpret_cast(val)); +#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + } + + static GKO_ATTRIBUTES uint16 float2half(uint32 data_) noexcept + { + using conv = detail::precision_converter; + if (f32_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask; + } else if (f32_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask | + f16_traits::significand_mask; + } else { + const auto exp = conv::shift_exponent(data_); + if (f16_traits::is_inf(exp)) { + return conv::shift_sign(data_) | exp; + } else if (f16_traits::is_denom(exp)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + // Rounding to even + const auto result = conv::shift_sign(data_) | exp | + conv::shift_significand(data_); + // return result + ((result & 1) && + // ((data_ >> (f32_traits::significand_bits - + // f16_traits::significand_bits - 1)) & + // 1)); + const auto tail = + data_ & static_cast( + (1 << conv::significand_offset) - 1); + + constexpr auto half = static_cast( + 1 << (conv::significand_offset - 1)); + return result + + (tail > half || ((tail == half) && (result & 1))); + } + } + } + + static GKO_ATTRIBUTES uint32 half2float(uint16 data_) noexcept + { + using conv = detail::precision_converter; + if (f16_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask; + } else if (f16_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask | + f32_traits::significand_mask; + } else if (f16_traits::is_denom(data_)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + return conv::shift_sign(data_) | conv::shift_exponent(data_) | 
+ conv::shift_significand(data_); + } + } + + uint16 data_; +}; + + +class bfloat16 { +public: + GKO_ATTRIBUTES bfloat16() noexcept : data_(0) {} + + template ::value>> + GKO_ATTRIBUTES bfloat16(const T val) + { + this->float2bfloat16(static_cast(val)); + } + + GKO_ATTRIBUTES bfloat16(const bfloat16& val) = default; + + GKO_ATTRIBUTES bfloat16(const half& val) : bfloat16(static_cast(val)) + {} + + template + GKO_ATTRIBUTES bfloat16& operator=(const V val) + { + this->float2bfloat16(static_cast(val)); + return *this; + } + + GKO_ATTRIBUTES operator float() const noexcept + { + const auto bits = bfloat162float(data_); + return reinterpret_cast(bits); + } + + // can not use bfloat16 operator _op(const bfloat16) for bfloat16 + bfloat16 + // operation will cast it to float and then do float operation such that it + // becomes float in the end. +#define bfloat16_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend bfloat16 operator _op(const bfloat16 lhf, \ + const bfloat16 rhf) \ + { \ + return static_cast(static_cast(lhf) \ + _op static_cast(rhf)); \ + } \ + GKO_ATTRIBUTES bfloat16& operator _opeq(const bfloat16& hf) \ + { \ + auto result = *this _op hf; \ + this->float2bfloat16(result); \ + return *this; \ + } + bfloat16_OPERATOR(+, +=) bfloat16_OPERATOR(-, -=) bfloat16_OPERATOR(*, *=) + bfloat16_OPERATOR(/, /=) + + // Do operation with different type + // If it is floating point, using floating point as type. 
+ // If it is integer, using bfloat16 as type +#define bfloat16_FRIEND_OPERATOR(_op, _opeq) \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + bfloat16>::type> \ + operator _op(const bfloat16 hf, const T val) \ + { \ + using type = \ + typename std::conditional::value, T, \ + bfloat16>::type; \ + auto result = static_cast(hf); \ + result _opeq static_cast(val); \ + return result; \ + } \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + bfloat16>::type> \ + operator _op(const T val, const bfloat16 hf) \ + { \ + using type = \ + typename std::conditional::value, T, \ + bfloat16>::type; \ + auto result = static_cast(val); \ + result _opeq static_cast(hf); \ + return result; \ + } + + bfloat16_FRIEND_OPERATOR(+, +=) bfloat16_FRIEND_OPERATOR(-, -=) + bfloat16_FRIEND_OPERATOR(*, *=) bfloat16_FRIEND_OPERATOR(/, /=) + + // the negative + GKO_ATTRIBUTES bfloat16 + operator-() const + { + auto val = 0.0f - *this; + return bfloat16(val); + } + +private: + using f16_traits = detail::float_traits; + using f32_traits = detail::float_traits; + + // TODO: do we really need this one? + // Without it, everything can be constexpr, which might make stuff easier. 
+ GKO_ATTRIBUTES void float2bfloat16(float val) noexcept + { + data_ = float2bfloat16(reinterpret_cast(val)); + } + + static GKO_ATTRIBUTES uint16 float2bfloat16(uint32 data_) noexcept + { + using conv = detail::precision_converter; + if (f32_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask; + } else if (f32_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask | + f16_traits::significand_mask; + } else { + const auto exp = conv::shift_exponent(data_); + if (f16_traits::is_inf(exp)) { + return conv::shift_sign(data_) | exp; + } else if (f16_traits::is_denom(exp)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + // Rounding to even + const auto result = conv::shift_sign(data_) | exp | + conv::shift_significand(data_); + // return result + ((result & 1) && + // ((data_ >> (f32_traits::significand_bits - + // f16_traits::significand_bits - 1)) & + // 1)); + const auto tail = + data_ & static_cast( + (1 << conv::significand_offset) - 1); + + constexpr auto bfloat16 = static_cast( + 1 << (conv::significand_offset - 1)); + return result + (tail > bfloat16 || + ((tail == bfloat16) && (result & 1))); + } + } + } + + static GKO_ATTRIBUTES uint32 bfloat162float(uint16 data_) noexcept + { + using conv = detail::precision_converter; + if (f16_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask; + } else if (f16_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask | + f32_traits::significand_mask; + } else if (f16_traits::is_denom(data_)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + return conv::shift_sign(data_) | conv::shift_exponent(data_) | + conv::shift_significand(data_); + } + } + + uint16 data_; +}; + + +inline GKO_ATTRIBUTES half::half(const bfloat16& val) + : half(static_cast(val)) +{} + + +} // namespace gko + + +namespace std { + +template <> +class complex; + +template <> 
+class complex { +public: + using value_type = gko::half; + + complex(const value_type& real = value_type(0.f), + const value_type& imag = value_type(0.f)) + : real_(real), imag_(imag) + {} + template ::value && + std::is_scalar::value>> + explicit complex(const T& real, const U& imag) + : real_(static_cast(real)), + imag_(static_cast(imag)) + {} + + template ::value>> + complex(const T& real) + : real_(static_cast(real)), + imag_(static_cast(0.f)) + {} + + complex(const gko::bfloat16& real) + : real_(static_cast(real)), + imag_(static_cast(0.f)) + {} + + // When using complex(real, imag), MSVC with CUDA try to recognize the + // complex is a member not constructor. + template ::value>> + explicit complex(const complex& other) + : real_(static_cast(other.real())), + imag_(static_cast(other.imag())) + {} + + explicit inline complex(const complex& other); + + // explicit complex(const complex& other) = default; + + value_type real() const noexcept { return real_; } + + value_type imag() const noexcept { return imag_; } + + + operator std::complex() const noexcept + { + return std::complex(static_cast(real_), + static_cast(imag_)); + } + + // operator std::complex() const noexcept + // { + // return std::complex(static_cast(real_), + // static_cast(imag_)); + // } + + template + complex& operator=(const V& val) + { + real_ = val; + imag_ = value_type(); + return *this; + } + + template + complex& operator=(const std::complex& val) + { + real_ = val.real(); + imag_ = val.imag(); + return *this; + } + + complex& operator+=(const value_type& real) + { + real_ += real; + return *this; + } + complex& operator-=(const value_type& real) + { + real_ -= real; + return *this; + } + complex& operator*=(const value_type& real) + { + real_ *= real; + imag_ *= real; + return *this; + } + complex& operator/=(const value_type& real) + { + real_ /= real; + imag_ /= real; + return *this; + } + + template + complex& operator+=(const complex& val) + { + real_ += val.real(); + imag_ += 
val.imag(); + return *this; + } + template + complex& operator-=(const complex& val) + { + real_ -= val.real(); + imag_ -= val.imag(); + return *this; + } + template + complex& operator*=(const complex& val) + { + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); + result_f *= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); + // auto tmp = real_; + // real_ = real_ * val.real() - imag_ * val.imag(); + // imag_ = tmp * val.imag() + imag_ * val.real(); + return *this; + } + template + complex& operator/=(const complex& val) + { + // auto real = val.real(); + // auto imag = val.imag(); + // (*this) *= complex{val.real(), -val.imag()}; + // (*this) /= (real * real + imag * imag); + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); + result_f /= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); + return *this; + } + +// It's for MacOS. +// TODO: check whether mac compiler always use complex version even when real +// half +#define COMPLEX_HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend complex operator _op( \ + const complex lhf, const complex rhf) \ + { \ + auto a = lhf; \ + a _opeq rhf; \ + return a; \ + } + + COMPLEX_HALF_OPERATOR(+, +=) + COMPLEX_HALF_OPERATOR(-, -=) + COMPLEX_HALF_OPERATOR(*, *=) + COMPLEX_HALF_OPERATOR(/, /=) + +private: + value_type real_; + value_type imag_; +}; + + +template <> +class complex { +public: + using value_type = gko::bfloat16; + + complex(const value_type& real = value_type(0.f), + const value_type& imag = value_type(0.f)) + : real_(real), imag_(imag) + {} + template ::value && + std::is_scalar::value>> + explicit complex(const T& real, const U& imag) + : real_(static_cast(real)), + imag_(static_cast(imag)) + {} + + template ::value>> + complex(const T& real) + : real_(static_cast(real)), + imag_(static_cast(0.f)) + {} + + complex(const gko::half& real) + : real_(static_cast(real)), + imag_(static_cast(0.f)) + {} + + // When using complex(real, imag), MSVC with 
CUDA try to recognize the + // complex is a member not constructor. + template ::value>> + explicit complex(const complex& other) + : real_(static_cast(other.real())), + imag_(static_cast(other.imag())) + {} + + explicit complex(const complex& other) + : real_(static_cast(other.real())), + imag_(static_cast(other.imag())) + {} + + // explicit complex(const complex& other) = default; + + value_type real() const noexcept { return real_; } + + value_type imag() const noexcept { return imag_; } + + + operator std::complex() const noexcept + { + return std::complex(static_cast(real_), + static_cast(imag_)); + } + + // operator std::complex() const noexcept + // { + // return std::complex(static_cast(real_), + // static_cast(imag_)); + // } + + template + complex& operator=(const V& val) + { + real_ = val; + imag_ = value_type(); + return *this; + } + + template + complex& operator=(const std::complex& val) + { + real_ = val.real(); + imag_ = val.imag(); + return *this; + } + + complex& operator+=(const value_type& real) + { + real_ += real; + return *this; + } + complex& operator-=(const value_type& real) + { + real_ -= real; + return *this; + } + complex& operator*=(const value_type& real) + { + real_ *= real; + imag_ *= real; + return *this; + } + complex& operator/=(const value_type& real) + { + real_ /= real; + imag_ /= real; + return *this; + } + + template + complex& operator+=(const complex& val) + { + real_ += val.real(); + imag_ += val.imag(); + return *this; + } + template + complex& operator-=(const complex& val) + { + real_ -= val.real(); + imag_ -= val.imag(); + return *this; + } + template + complex& operator*=(const complex& val) + { + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); + result_f *= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); + // auto tmp = real_; + // real_ = real_ * val.real() - imag_ * val.imag(); + // imag_ = tmp * val.imag() + imag_ * val.real(); + return *this; + } + template + complex& 
operator/=(const complex& val) + { + // auto real = val.real(); + // auto imag = val.imag(); + // (*this) *= complex{val.real(), -val.imag()}; + // (*this) /= (real * real + imag * imag); + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); + result_f /= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); + return *this; + } + +// It's for MacOS. +// TODO: check whether mac compiler always use complex version even when real +// bfloat16 +#define COMPLEX_BFLOAT16_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend complex operator _op( \ + const complex lhf, const complex rhf) \ + { \ + auto a = lhf; \ + a _opeq rhf; \ + return a; \ + } + + COMPLEX_BFLOAT16_OPERATOR(+, +=) + COMPLEX_BFLOAT16_OPERATOR(-, -=) + COMPLEX_BFLOAT16_OPERATOR(*, *=) + COMPLEX_BFLOAT16_OPERATOR(/, /=) + +private: + value_type real_; + value_type imag_; +}; + + +inline complex::complex(const complex& other) + : real_(static_cast(other.real())), + imag_(static_cast(other.imag())) +{} + + +template <> +struct numeric_limits { + static constexpr bool is_specialized{true}; + static constexpr bool is_signed{true}; + static constexpr bool is_integer{false}; + static constexpr bool is_exact{false}; + static constexpr bool is_bounded{true}; + static constexpr bool is_modulo{false}; + static constexpr int digits{ + gko::detail::float_traits::significand_bits + 1}; + // 3/10 is approx. log_10(2) + static constexpr int digits10{digits * 3 / 10}; + + // Note: gko::half can't return gko::half here because it does not have + // a constexpr constructor. 
+ static constexpr float epsilon() + { + return gko::detail::float_traits::eps; + } + + static constexpr float infinity() + { + return numeric_limits::infinity(); + } + + static constexpr float min() { return numeric_limits::min(); } + + static constexpr float max() { return numeric_limits::max(); } + + static constexpr float quiet_NaN() + { + return numeric_limits::quiet_NaN(); + } +}; + + +template <> +struct numeric_limits { + static constexpr bool is_specialized{true}; + static constexpr bool is_signed{true}; + static constexpr bool is_integer{false}; + static constexpr bool is_exact{false}; + static constexpr bool is_bounded{true}; + static constexpr bool is_modulo{false}; + static constexpr int digits{ + gko::detail::float_traits::significand_bits + 1}; + // 3/10 is approx. log_10(2) + static constexpr int digits10{digits * 3 / 10}; + + // Note: gko::bfloat16 can't return gko::bfloat16 here because it does not + // have + // a constexpr constructor. + static constexpr float epsilon() + { + return gko::detail::float_traits::eps; + } + + static constexpr float infinity() + { + return numeric_limits::infinity(); + } + + static constexpr float min() { return numeric_limits::min(); } + + static constexpr float max() { return numeric_limits::max(); } + + static constexpr float quiet_NaN() + { + return numeric_limits::quiet_NaN(); + } +}; + + +// complex using a template on operator= for any kind of complex, so we can +// do full specialization for half +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + + +// For MSVC +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + +template <> +inline complex& complex::operator=( + const 
std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + + +} // namespace std + + +#endif // GKO_PUBLIC_CORE_BASE_HALF_HPP_ diff --git a/include/ginkgo/core/base/index_set.hpp b/include/ginkgo/core/base/index_set.hpp index 281690b7807..647671378f7 100644 --- a/include/ginkgo/core/base/index_set.hpp +++ b/include/ginkgo/core/base/index_set.hpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp index 2366c824316..b9264c39030 100644 --- a/include/ginkgo/core/base/intrinsics.hpp +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index c06c43bbb6e..bdf78767e95 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 70e4db5bb2d..09fdef5bea4 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -43,10 +43,72 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include +class __half; +class __nv_bfloat16; +class hip_bfloat16; + + +namespace thrust { + + +template +class complex; + + +} +namespace std { + + +inline gko::half abs(gko::half a) { return gko::half((a > 0) ? 
a : -a); } + +inline gko::half abs(std::complex a) +{ + // Using float abs not sqrt on norm to avoid overflow + return gko::half(abs(std::complex(a))); +} + + +inline gko::half sqrt(gko::half a) { return gko::half(sqrt(float(a))); } + +inline std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex( + static_cast(a.real()), static_cast(a.imag())))); +} + + +inline gko::bfloat16 abs(gko::bfloat16 a) +{ + return gko::bfloat16((a > 0) ? a : -a); +} + +inline gko::bfloat16 abs(std::complex a) +{ + // Using float abs not sqrt on norm to avoid overflow + return gko::bfloat16(abs(std::complex(a))); +} + + +inline gko::bfloat16 sqrt(gko::bfloat16 a) +{ + return gko::bfloat16(sqrt(float(a))); +} + +inline std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex( + static_cast(a.real()), static_cast(a.imag())))); +} + + +} // namespace std + + namespace gko { @@ -176,12 +238,36 @@ template struct is_complex_impl> : public std::integral_constant {}; +template +struct is_complex_impl> + : public std::integral_constant {}; + template struct is_complex_or_scalar_impl : std::is_scalar {}; +template <> +struct is_complex_or_scalar_impl : std::true_type {}; + +template <> +struct is_complex_or_scalar_impl<__half> : std::true_type {}; + +template <> +struct is_complex_or_scalar_impl : std::true_type {}; + +template <> +struct is_complex_or_scalar_impl<__nv_bfloat16> : std::true_type {}; + +template <> +struct is_complex_or_scalar_impl : std::true_type {}; + +template +struct is_complex_or_scalar_impl> + : is_complex_or_scalar_impl {}; + template -struct is_complex_or_scalar_impl> : std::is_scalar {}; +struct is_complex_or_scalar_impl> + : is_complex_or_scalar_impl {}; /** @@ -389,6 +475,19 @@ namespace detail { template struct next_precision_impl {}; +#if GINKGO_ENABLE_HALF +template <> +struct next_precision_impl { + using type = bfloat16; +}; + + +template <> +struct next_precision_impl { + using type = float; +}; +#endif + template <> 
struct next_precision_impl { using type = double; @@ -396,15 +495,33 @@ struct next_precision_impl { template <> struct next_precision_impl { +#if GINKGO_ENABLE_HALF + using type = half; +#else using type = float; +#endif }; + template struct next_precision_impl> { using type = std::complex::type>; }; +template +struct next_precision_impl2 { + using type = + typename next_precision_impl2::type, + I - 1>::type; +}; + +template +struct next_precision_impl2 { + using type = T; +}; + + template struct reduce_precision_impl { using type = T; @@ -447,11 +564,32 @@ struct increase_precision_impl { }; +template +struct arth_type { + using type = T; +}; + +template <> +struct arth_type { + using type = float; +}; + +template <> +struct arth_type { + using type = float; +}; + +template +struct arth_type> { + using type = std::complex::type>; +}; + template struct infinity_impl { // CUDA doesn't allow us to call std::numeric_limits functions // so we need to store the value instead. - static constexpr auto value = std::numeric_limits::infinity(); + static constexpr auto value = + std::numeric_limits::type>::infinity(); }; @@ -463,11 +601,87 @@ struct highest_precision_impl { using type = decltype(T1{} + T2{}); }; +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl<__nv_bfloat16, __half> { + using type = float; +}; + +template <> +struct highest_precision_impl<__half, __nv_bfloat16> { + using type = float; +}; + +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl<__half, hip_bfloat16> { + using type = float; +}; + + +template <> +struct highest_precision_impl { + using type = double; +}; + +template <> +struct highest_precision_impl<__half, double> { + using type = double; +}; + +template <> +struct highest_precision_impl { + using type = double; +}; + +template <> +struct 
highest_precision_impl<__nv_bfloat16, double> { + using type = double; +}; + +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl<__half, float> { + using type = float; +}; + +template <> +struct highest_precision_impl { + using type = float; +}; + +template <> +struct highest_precision_impl<__nv_bfloat16, float> { + using type = float; +}; + template struct highest_precision_impl, std::complex> { using type = std::complex::type>; }; +template +struct highest_precision_impl, thrust::complex> { + using type = thrust::complex::type>; +}; + template struct highest_precision_variadic { using type = typename highest_precision_impl< @@ -489,6 +703,9 @@ struct highest_precision_variadic { template using next_precision = typename detail::next_precision_impl::type; +template +using next_precision2 = typename detail::next_precision_impl2::type; + /** * Obtains the previous type in the singly-linked precision list. @@ -496,8 +713,30 @@ using next_precision = typename detail::next_precision_impl::type; * @note Currently our lists contains only two elements, so this is the same as * next_precision. 
*/ +#if GINKGO_ENABLE_HALF +template +using previous_precision = next_precision>>; +#else template using previous_precision = next_precision; +#endif + +namespace detail { +template +struct previous_precision_impl2 { + using type = + typename previous_precision_impl2, I - 1>::type; +}; + +template +struct previous_precision_impl2 { + using type = T; +}; +} // namespace detail + +template +using previous_precision2 = + typename detail::previous_precision_impl2::type; /** @@ -683,7 +922,7 @@ GKO_INLINE __host__ constexpr T zero(const T&) template GKO_INLINE __host__ constexpr T one() { - return T(1); + return T(static_cast>(1.0)); } @@ -743,7 +982,7 @@ GKO_INLINE __device__ constexpr std::enable_if_t< !std::is_same>>::value, T> one() { - return T(1); + return T(static_cast>(1.0)); } @@ -802,7 +1041,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr T zero(const T&) template GKO_INLINE GKO_ATTRIBUTES constexpr T one() { - return T(1); + return T(1.0); } @@ -998,7 +1237,7 @@ template GKO_ATTRIBUTES GKO_INLINE constexpr std::enable_if_t::value, T> imag_impl(const T&) { - return T{}; + return T(0.0); } template @@ -1198,7 +1437,8 @@ template GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_finite(const T& value) { - constexpr T infinity{detail::infinity_impl::value}; + constexpr typename detail::arth_type::type infinity{ + detail::infinity_impl::value}; return abs(value) < infinity; } @@ -1282,13 +1522,13 @@ GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( * @return NaN. */ template -GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> +GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t< + !is_complex_s::value, typename detail::arth_type::type> nan() { return std::numeric_limits::quiet_NaN(); } - /** * Returns a complex with both components quiet NaN. * @@ -1297,7 +1537,8 @@ nan() * @return complex{NaN, NaN}. 
*/ template -GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> +GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t< + is_complex_s::value, typename detail::arth_type::type> nan() { return T{nan>(), nan>()}; diff --git a/include/ginkgo/core/base/matrix_assembly_data.hpp b/include/ginkgo/core/base/matrix_assembly_data.hpp index 3ea112094f2..5aa4419aa2d 100644 --- a/include/ginkgo/core/base/matrix_assembly_data.hpp +++ b/include/ginkgo/core/base/matrix_assembly_data.hpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index 57ac0ad5f5b..fbe60c7f07e 100644 --- a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include @@ -67,7 +68,7 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(Distribution&& dist, Generator&& gen) { - return dist(gen); + return ValueType(dist(gen)); } diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index bf985cabeb7..5d9d68d19ee 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include @@ -117,6 +118,14 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG); GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE); GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); +#if GINKGO_ENABLE_HALF +// OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16 +// TODO: it only works on the transferring +GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); +GKO_REGISTER_MPI_TYPE(std::complex, MPI_FLOAT); +GKO_REGISTER_MPI_TYPE(bfloat16, MPI_UNSIGNED_SHORT); +GKO_REGISTER_MPI_TYPE(std::complex, MPI_FLOAT); +#endif // GKO_ENABLE_HALF GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_FLOAT_COMPLEX); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 1ddc299fed9..27714266c40 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -77,12 +77,16 @@ make_temporary_conversion(Ptr&& matrix) using Pointee = detail::pointee; using Dense = matrix::Dense; using NextDense = matrix::Dense>; + using NextNextDense = + matrix::Dense>>; + using NextNextNextDense = matrix::Dense>; using MaybeConstDense = std::conditional_t::value, const Dense, Dense>; - auto result = detail::temporary_conversion< - MaybeConstDense>::template create(matrix); + auto result = + detail::temporary_conversion::template create< + NextDense, NextNextDense, NextNextNextDense>(matrix); if (!result) { - GKO_NOT_SUPPORTED(*matrix); + GKO_NOT_SUPPORTED(matrix); } return result; } @@ -255,11 +259,17 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) #ifdef GINKGO_MIXED_PRECISION using fst_type = matrix::Dense; using snd_type = matrix::Dense>; + using trd_type = matrix::Dense>>; + using fth_type = matrix::Dense>; if (auto dense_in = dynamic_cast(in)) { if (auto dense_out = dynamic_cast(out)) { 
fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } @@ -268,6 +278,34 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else { + GKO_NOT_SUPPORTED(out); + } + } else if (auto dense_in = dynamic_cast(in)) { + if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else { + GKO_NOT_SUPPORTED(out); + } + } else if (auto dense_in = dynamic_cast(in)) { + if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } @@ -365,7 +403,10 @@ make_temporary_conversion(LinOp* matrix) auto result = detail::temporary_conversion< experimental::distributed::Vector>:: template create< - experimental::distributed::Vector>>( + experimental::distributed::Vector>, + experimental::distributed::Vector< + next_precision>>, + experimental::distributed::Vector>>( matrix); if (!result) { GKO_NOT_SUPPORTED(matrix); @@ -384,7 +425,10 @@ make_temporary_conversion(const LinOp* matrix) auto result = detail::temporary_conversion< const 
experimental::distributed::Vector>:: template create< - experimental::distributed::Vector>>( + experimental::distributed::Vector>, + experimental::distributed::Vector< + next_precision>>, + experimental::distributed::Vector>>( matrix); if (!result) { GKO_NOT_SUPPORTED(matrix); diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 1e4c7a5d00e..cdd9af420ee 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/base/range_accessors.hpp b/include/ginkgo/core/base/range_accessors.hpp index 20934afcdf4..546d1891cd7 100644 --- a/include/ginkgo/core/base/range_accessors.hpp +++ b/include/ginkgo/core/base/range_accessors.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 68b5da6e3eb..f7da7f28075 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -46,6 +46,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include + + #ifdef __HIPCC__ #include #endif // __HIPCC__ @@ -156,8 +159,14 @@ using uint64 = std::uint64_t; */ using uintptr = std::uintptr_t; - +// #if defined(SYCL_LANGUAGE_VERSION) && \ +// (__LIBSYCL_MAJOR_VERSION > 5 || \ +// (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) +// using half = sycl::half; +// #else class half; +class bfloat16; +// #endif /** @@ -419,6 +428,17 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) +// cuda half operation is supported from arch 5.3 +#if GINKGO_ENABLE_HALF && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530) +#define GKO_ADAPT_HF(_macro) template _macro +#else +#define GKO_ADAPT_HF(_macro) \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") +#endif + + /** * Instantiates a template for each non-complex value type compiled by Ginkgo. * @@ -429,15 +449,23 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half)); \ + GKO_ADAPT_HF(_macro(bfloat16)); \ template _macro(float); \ template <> \ _macro(double) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half)); \ + GKO_ADAPT_HF(_macro(bfloat16)); \ template _macro(float); \ template _macro(double) #endif +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(_macro) \ + template _macro(float); \ + template _macro(double) + /** * Instantiates a template for each value type compiled by Ginkgo. 
@@ -450,16 +478,25 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + GKO_ADAPT_HF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template <> \ _macro(std::complex) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + GKO_ADAPT_HF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex) #endif +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(_macro) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(_macro); \ + template _macro(std::complex); \ + template _macro(std::complex) + /** * Instantiates a template for each value and scalar type compiled by Ginkgo. @@ -473,22 +510,34 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ + GKO_ADAPT_HF(_macro(std::complex, bfloat16)); \ template _macro(std::complex, float); \ template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; #else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template 
_macro(std::complex, float); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ + GKO_ADAPT_HF(_macro(std::complex, bfloat16)); \ + template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -517,16 +566,24 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half, int32)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32)); \ template _macro(float, int32); \ template <> \ _macro(double, int32) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(half, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int64)); \ template _macro(float, int64); \ template <> \ _macro(double, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half, int32)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32)); \ template _macro(float, int32); \ template _macro(double, int32); \ + GKO_ADAPT_HF(_macro(half, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int64)); \ template _macro(float, int64); \ template _macro(double, int64) #endif @@ -543,17 +600,25 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) 
GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template <> \ _macro(std::complex, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -571,6 +636,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ + GKO_ADAPT_HF(_macro(half, int32, int32)); \ + GKO_ADAPT_HF(_macro(half, int32, int64)); \ + GKO_ADAPT_HF(_macro(half, int64, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32, int32)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -583,6 +654,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ + GKO_ADAPT_HF(_macro(half, int32, int32)); \ + GKO_ADAPT_HF(_macro(half, int32, int64)); \ + GKO_ADAPT_HF(_macro(half, int64, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32, int32)); \ + GKO_ADAPT_HF(_macro(bfloat16, int32, int64)); \ + GKO_ADAPT_HF(_macro(bfloat16, int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -604,6 +681,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, 
#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -617,6 +700,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -627,24 +716,52 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template <> \ - _macro(float, double) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(double, float) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template <> \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template <> \ + _macro(float, double) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, float) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(half, double) GKO_NOT_IMPLEMENTED; \ + template <> \ + 
_macro(double, half) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(bfloat16, double) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, bfloat16) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(float, half)); \ + GKO_ADAPT_HF(_macro(half, float)); \ + GKO_ADAPT_HF(_macro(float, bfloat16)); \ + GKO_ADAPT_HF(_macro(bfloat16, float)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - template _macro(float, float); \ - template <> \ - _macro(double, double) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, std::complex); \ - template <> \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template <> \ + _macro(double, double) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED #else /** @@ -656,10 +773,30 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the 
source and destination value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template _macro(float, double); \ + template _macro(double, float); \ + GKO_ADAPT_HF(_macro(half, double)); \ + GKO_ADAPT_HF(_macro(double, half)); \ + GKO_ADAPT_HF(_macro(float, half)); \ + GKO_ADAPT_HF(_macro(half, float)); \ + GKO_ADAPT_HF(_macro(bfloat16, double)); \ + GKO_ADAPT_HF(_macro(double, bfloat16)); \ + GKO_ADAPT_HF(_macro(float, bfloat16)); \ + GKO_ADAPT_HF(_macro(bfloat16, float)); \ + GKO_ADAPT_HF(_macro(bfloat16, half)); \ + GKO_ADAPT_HF(_macro(half, bfloat16)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -672,11 +809,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -689,12 +830,18 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ + GKO_ADAPT_HF(_macro(std::complex, bfloat16)); \ + template _macro(std::complex, float); \ + template _macro(std::complex, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -707,16 +854,20 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(char, char); \ - template _macro(int32, int32); \ - template _macro(int64, int64); \ - template _macro(unsigned int, unsigned int); \ - template _macro(unsigned long, unsigned long); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(long double, long double); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(char, char); \ + template _macro(int32, int32); \ + template _macro(int64, int64); \ + template _macro(unsigned int, unsigned int); \ + template _macro(unsigned long, unsigned long); \ + GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(_macro(bfloat16, bfloat16)); \ + template _macro(float, float); \ + template _macro(double, double); \ + template _macro(long double, long double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) /** @@ -728,8 +879,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * value and index types. */ #define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half)); \ + GKO_ADAPT_HF(_macro(bfloat16)); \ template _macro(float); \ template _macro(double); \ + GKO_ADAPT_HF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex); \ template _macro(size_type); \ diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp index 3f26d5d7659..95890f4bd6d 100644 --- a/include/ginkgo/core/base/utils_helper.hpp +++ b/include/ginkgo/core/base/utils_helper.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include diff --git a/include/ginkgo/core/base/version.hpp b/include/ginkgo/core/base/version.hpp index caa0cbe0761..299d8fe93f1 100644 --- a/include/ginkgo/core/base/version.hpp +++ b/include/ginkgo/core/base/version.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index da91f8f0e60..2a351c31321 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -268,10 +268,16 @@ class Matrix Matrix>, public ConvertibleTo< Matrix, LocalIndexType, GlobalIndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>, + LocalIndexType, GlobalIndexType>>, + public ConvertibleTo, LocalIndexType, + GlobalIndexType>>, +#endif public DistributedBase { friend class EnableCreateMethod; friend class EnableDistributedPolymorphicObject; - friend class Matrix, LocalIndexType, + friend class Matrix, LocalIndexType, GlobalIndexType>; public: @@ -295,7 +301,35 @@ class Matrix void move_to(Matrix, local_index_type, global_index_type>* result) override; +#if GINKGO_ENABLE_HALF + friend class Matrix>, + LocalIndexType, GlobalIndexType>; + using ConvertibleTo< + Matrix>, local_index_type, + global_index_type>>::convert_to; + using ConvertibleTo>, + local_index_type, global_index_type>>::move_to; + + void convert_to( + Matrix>, local_index_type, + global_index_type>* result) const override; + + void move_to(Matrix>, + local_index_type, global_index_type>* result) override; + + friend class Matrix, LocalIndexType, + GlobalIndexType>; + using ConvertibleTo, local_index_type, + global_index_type>>::convert_to; + using ConvertibleTo, local_index_type, + global_index_type>>::move_to; + + void convert_to(Matrix, local_index_type, + global_index_type>* result) const override; + void move_to(Matrix, local_index_type, + global_index_type>* result) 
override; +#endif /** * Reads a square matrix from the device_matrix_data structure and a global * partition. diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp index bb36528a4a8..f3eebf68283 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 61ceab8e380..bd7778dc51c 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -88,13 +88,17 @@ class Vector : public EnableDistributedLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>>>, + public ConvertibleTo>>, +#endif public EnableAbsoluteComputation>>, public DistributedBase { friend class EnableCreateMethod; friend class EnableDistributedPolymorphicObject; friend class Vector>; friend class Vector>; - friend class Vector>; + friend class Vector>; public: using EnableDistributedLinOp::convert_to; @@ -193,6 +197,29 @@ class Vector void move_to(Vector>* result) override; +#if GINKGO_ENABLE_HALF + friend class Vector>>; + using ConvertibleTo< + Vector>>>::convert_to; + using ConvertibleTo< + Vector>>>::move_to; + + void convert_to(Vector>>* result) + const override; + + void move_to( + Vector>>* result) override; + + friend class Vector>; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; + + void convert_to( + Vector>* result) const override; + + void move_to(Vector>* result) override; +#endif + std::unique_ptr compute_absolute() const override; void compute_absolute_inplace() override; @@ -647,6 +674,37 @@ struct conversion_target_helper> { return target_type::create(source->get_executor(), source->get_communicator()); } + + // Allow to create_empty of 
the same type + // For distributed case, next> will be V in the candicated list. + // TODO: decide to whether to add this or add condition to the list + static std::unique_ptr create_empty(const target_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } + +#if GINKGO_ENABLE_HALF + using snd_source_type = experimental::distributed::Vector< + previous_precision>>; + + static std::unique_ptr create_empty( + const snd_source_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } + + using trd_source_type = + experimental::distributed::Vector>; + + static std::unique_ptr create_empty( + const trd_source_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } +#endif }; diff --git a/include/ginkgo/core/factorization/factorization.hpp b/include/ginkgo/core/factorization/factorization.hpp index 65b551c35f2..c16a65c4704 100644 --- a/include/ginkgo/core/factorization/factorization.hpp +++ b/include/ginkgo/core/factorization/factorization.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/ic.hpp b/include/ginkgo/core/factorization/ic.hpp index d3f0ac27926..a48e076d852 100644 --- a/include/ginkgo/core/factorization/ic.hpp +++ b/include/ginkgo/core/factorization/ic.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp index 98d36ee9d87..a6341f09ba8 100644 --- a/include/ginkgo/core/factorization/ilu.hpp +++ b/include/ginkgo/core/factorization/ilu.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ic.hpp b/include/ginkgo/core/factorization/par_ic.hpp index 2df350f31a2..fd5abbb726b 100644 --- a/include/ginkgo/core/factorization/par_ic.hpp +++ b/include/ginkgo/core/factorization/par_ic.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp index 173136fa682..573666969ad 100644 --- a/include/ginkgo/core/factorization/par_ict.hpp +++ b/include/ginkgo/core/factorization/par_ict.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp index 878721afbd5..954b8a484cb 100644 --- a/include/ginkgo/core/factorization/par_ilu.hpp +++ b/include/ginkgo/core/factorization/par_ilu.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp index 76f3789a44e..cc8b17c281c 100644 --- a/include/ginkgo/core/factorization/par_ilut.hpp +++ b/include/ginkgo/core/factorization/par_ilut.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index b700e1e703a..4c8a9981e10 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 15662294607..217e50980d1 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -78,6 +78,11 @@ template class Coo : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Coo>, IndexType>>, + public ConvertibleTo, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -110,13 +115,39 @@ class Coo : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; - friend class Coo, IndexType>; + friend class Coo, IndexType>; void convert_to( Coo, IndexType>* result) const override; void move_to(Coo, IndexType>* result) override; +#if GINKGO_ENABLE_HALF + friend class Coo>, + IndexType>; + using ConvertibleTo< + Coo>, IndexType>>::convert_to; + using ConvertibleTo< + Coo>, IndexType>>::move_to; + + void convert_to(Coo>, IndexType>* + result) const override; + + void move_to(Coo>, IndexType>* + result) override; + + friend class Coo, IndexType>; + using ConvertibleTo< + Coo, IndexType>>::convert_to; + using ConvertibleTo, IndexType>>::move_to; + + void convert_to( + Coo, IndexType>* result) const override; + + void move_to( + Coo, IndexType>* result) override; +#endif + void convert_to(Csr* other) const override; void move_to(Csr* other) override; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 611e5d33c64..28ebc31f05e 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -127,6 +127,11 @@ template class Csr : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Csr>, IndexType>>, + public ConvertibleTo, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public 
ConvertibleTo>, @@ -716,13 +721,40 @@ class Csr : public EnableLinOp>, index_type max_length_per_row_; }; - friend class Csr, IndexType>; + friend class Csr, IndexType>; void convert_to( Csr, IndexType>* result) const override; void move_to(Csr, IndexType>* result) override; +#if GINKGO_ENABLE_HALF + friend class Csr>, + IndexType>; + using ConvertibleTo< + Csr>, IndexType>>::convert_to; + using ConvertibleTo< + Csr>, IndexType>>::move_to; + + void convert_to(Csr>, IndexType>* + result) const override; + + void move_to(Csr>, IndexType>* + result) override; + + + friend class Csr, IndexType>; + using ConvertibleTo< + Csr, IndexType>>::convert_to; + using ConvertibleTo, IndexType>>::move_to; + + void convert_to( + Csr, IndexType>* result) const override; + + void move_to( + Csr, IndexType>* result) override; +#endif + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index ae738d49b93..374b00b9eb5 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include @@ -108,6 +109,10 @@ class Dense : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>>>, + public ConvertibleTo>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -296,12 +301,35 @@ class Dense return other->create_const_view_of_impl(); } - friend class Dense>; + friend class Dense>; void convert_to(Dense>* result) const override; void move_to(Dense>* result) override; +#if GINKGO_ENABLE_HALF + friend class Dense>>; + using ConvertibleTo< + Dense>>>::convert_to; + using ConvertibleTo< + Dense>>>::move_to; + + void convert_to(Dense>>* result) + const override; + + void move_to( + Dense>>* result) override; + + friend class Dense>; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; + + void convert_to( + Dense>* result) const override; + + void move_to(Dense>* result) override; +#endif + void convert_to(Coo* result) const override; void move_to(Coo* result) override; diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index 50febffcfad..de34978b309 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -71,6 +71,10 @@ class Diagonal public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>>>, + public ConvertibleTo>>, +#endif public Transposable, public WritableToMatrixData, public WritableToMatrixData, @@ -101,7 +105,7 @@ class Diagonal using device_mat_data32 = device_matrix_data; using absolute_type = remove_complex; - friend class Diagonal>; + friend class Diagonal>; std::unique_ptr transpose() const override; @@ -111,6 +115,29 @@ class Diagonal void move_to(Diagonal>* result) override; +#if GINKGO_ENABLE_HALF + friend class Diagonal>>; + using ConvertibleTo< + Diagonal>>>::convert_to; + using ConvertibleTo< + Diagonal>>>::move_to; + 
+ void convert_to(Diagonal>>* result) + const override; + + void move_to( + Diagonal>>* result) override; + + friend class Diagonal>; + using ConvertibleTo>>::convert_to; + using ConvertibleTo>>::move_to; + + void convert_to( + Diagonal>* result) const override; + + void move_to(Diagonal>* result) override; +#endif + void convert_to(Csr* result) const override; void move_to(Csr* result) override; diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index afa19f49407..5e948b3440c 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -80,6 +80,11 @@ template class Ell : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Ell>, IndexType>>, + public ConvertibleTo, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -93,7 +98,7 @@ class Ell : public EnableLinOp>, friend class Coo; friend class Csr; friend class Ell, IndexType>; - friend class Ell, IndexType>; + friend class Ell, IndexType>; friend class Hybrid; public: @@ -118,6 +123,32 @@ class Ell : public EnableLinOp>, void move_to(Ell, IndexType>* result) override; +#if GINKGO_ENABLE_HALF + friend class Ell>, + IndexType>; + using ConvertibleTo< + Ell>, IndexType>>::convert_to; + using ConvertibleTo< + Ell>, IndexType>>::move_to; + + void convert_to(Ell>, IndexType>* + result) const override; + + void move_to(Ell>, IndexType>* + result) override; + + friend class Ell, IndexType>; + using ConvertibleTo< + Ell, IndexType>>::convert_to; + using ConvertibleTo, IndexType>>::move_to; + + void convert_to( + Ell, IndexType>* result) const override; + + void move_to( + Ell, IndexType>* result) override; +#endif + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index b8833d59b7f..e9d019cec5f 100644 --- 
a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -124,18 +124,24 @@ inline IndexType get_num_blocks(const int block_size, const IndexType size) * @ingroup LinOp */ template -class Fbcsr : public EnableLinOp>, - public EnableCreateMethod>, - public ConvertibleTo, IndexType>>, - public ConvertibleTo>, - public ConvertibleTo>, - public ConvertibleTo>, - public DiagonalExtractable, - public ReadableFromMatrixData, - public WritableToMatrixData, - public Transposable, - public EnableAbsoluteComputation< - remove_complex>> { +class Fbcsr + : public EnableLinOp>, + public EnableCreateMethod>, + public ConvertibleTo, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Fbcsr>, IndexType>>, + public ConvertibleTo, IndexType>>, +#endif + public ConvertibleTo>, + public ConvertibleTo>, + public ConvertibleTo>, + public DiagonalExtractable, + public ReadableFromMatrixData, + public WritableToMatrixData, + public Transposable, + public EnableAbsoluteComputation< + remove_complex>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Csr; @@ -175,13 +181,40 @@ class Fbcsr : public EnableLinOp>, using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; - friend class Fbcsr, IndexType>; + friend class Fbcsr, IndexType>; void convert_to( Fbcsr, IndexType>* result) const override; void move_to(Fbcsr, IndexType>* result) override; +#if GINKGO_ENABLE_HALF + friend class Fbcsr>, + IndexType>; + using ConvertibleTo>, + IndexType>>::convert_to; + using ConvertibleTo< + Fbcsr>, IndexType>>::move_to; + + void convert_to(Fbcsr>, IndexType>* + result) const override; + + void move_to(Fbcsr>, IndexType>* + result) override; + + friend class Fbcsr, IndexType>; + using ConvertibleTo< + Fbcsr, IndexType>>::convert_to; + using ConvertibleTo< + Fbcsr, IndexType>>::move_to; + + void convert_to( + Fbcsr, IndexType>* result) const override; + + void move_to( + Fbcsr, IndexType>* result) override; +#endif + void 
convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index db65b57b6fb..6c17004d3a5 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -72,6 +72,11 @@ class Hybrid : public EnableLinOp>, public EnableCreateMethod>, public ConvertibleTo, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Hybrid>, IndexType>>, + public ConvertibleTo, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -386,13 +391,40 @@ class Hybrid imbalance_bounded_limit strategy_; }; - friend class Hybrid, IndexType>; + friend class Hybrid, IndexType>; void convert_to( Hybrid, IndexType>* result) const override; void move_to(Hybrid, IndexType>* result) override; +#if GINKGO_ENABLE_HALF + friend class Hybrid>, + IndexType>; + using ConvertibleTo>, + IndexType>>::convert_to; + using ConvertibleTo< + Hybrid>, IndexType>>::move_to; + + void convert_to(Hybrid>, + IndexType>* result) const override; + + void move_to(Hybrid>, IndexType>* + result) override; + + friend class Hybrid, IndexType>; + using ConvertibleTo< + Hybrid, IndexType>>::convert_to; + using ConvertibleTo< + Hybrid, IndexType>>::move_to; + + void convert_to(Hybrid, IndexType>* result) + const override; + + void move_to( + Hybrid, IndexType>* result) override; +#endif + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp index 163160a2af6..0ccc24004ee 100644 --- a/include/ginkgo/core/matrix/permutation.hpp +++ b/include/ginkgo/core/matrix/permutation.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/matrix/row_gatherer.hpp b/include/ginkgo/core/matrix/row_gatherer.hpp index 3baedce4806..4317dfee51b 100644 --- a/include/ginkgo/core/matrix/row_gatherer.hpp +++ b/include/ginkgo/core/matrix/row_gatherer.hpp @@ -44,6 +44,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index 70656152e27..2e7f93cff4b 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -68,16 +68,22 @@ class Csr; * @ingroup LinOp */ template -class Sellp : public EnableLinOp>, - public EnableCreateMethod>, - public ConvertibleTo, IndexType>>, - public ConvertibleTo>, - public ConvertibleTo>, - public DiagonalExtractable, - public ReadableFromMatrixData, - public WritableToMatrixData, - public EnableAbsoluteComputation< - remove_complex>> { +class Sellp + : public EnableLinOp>, + public EnableCreateMethod>, + public ConvertibleTo, IndexType>>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Sellp>, IndexType>>, + public ConvertibleTo, IndexType>>, +#endif + public ConvertibleTo>, + public ConvertibleTo>, + public DiagonalExtractable, + public ReadableFromMatrixData, + public WritableToMatrixData, + public EnableAbsoluteComputation< + remove_complex>> { friend class EnableCreateMethod; friend class EnablePolymorphicObject; friend class Dense; @@ -102,13 +108,40 @@ class Sellp : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; - friend class Sellp, IndexType>; + friend class Sellp, IndexType>; void convert_to( Sellp, IndexType>* result) const override; void move_to(Sellp, IndexType>* result) override; +#if GINKGO_ENABLE_HALF + friend class Sellp>, + IndexType>; + using ConvertibleTo>, + IndexType>>::convert_to; + using ConvertibleTo< + 
Sellp>, IndexType>>::move_to; + + void convert_to(Sellp>, IndexType>* + result) const override; + + void move_to(Sellp>, IndexType>* + result) override; + + friend class Sellp, IndexType>; + using ConvertibleTo< + Sellp, IndexType>>::convert_to; + using ConvertibleTo< + Sellp, IndexType>>::move_to; + + void convert_to( + Sellp, IndexType>* result) const override; + + void move_to( + Sellp, IndexType>* result) override; +#endif + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/multigrid/fixed_coarsening.hpp b/include/ginkgo/core/multigrid/fixed_coarsening.hpp index 3c5c3998536..c168e973a24 100644 --- a/include/ginkgo/core/multigrid/fixed_coarsening.hpp +++ b/include/ginkgo/core/multigrid/fixed_coarsening.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/multigrid/pgm.hpp b/include/ginkgo/core/multigrid/pgm.hpp index a90507ce740..16d77aa2e11 100644 --- a/include/ginkgo/core/multigrid/pgm.hpp +++ b/include/ginkgo/core/multigrid/pgm.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp index 7db9d19c7c2..19a1759ef0d 100644 --- a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -388,7 +388,8 @@ class Ilu : public EnableLinOp< generate_default_solver(const std::shared_ptr& exec, const std::shared_ptr& mtx) { - constexpr gko::remove_complex default_reduce_residual{1e-4}; + // half can not use constexpr constructor + const gko::remove_complex default_reduce_residual{1e-4}; const unsigned int default_max_iters{ static_cast(mtx->get_size()[0])}; diff --git a/include/ginkgo/core/reorder/rcm.hpp b/include/ginkgo/core/reorder/rcm.hpp index 72ba6827f2b..5ffe80b8524 100644 --- a/include/ginkgo/core/reorder/rcm.hpp +++ b/include/ginkgo/core/reorder/rcm.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/reorder/scaled_reordered.hpp b/include/ginkgo/core/reorder/scaled_reordered.hpp index 3c4f6efbbd7..9269106eb07 100644 --- a/include/ginkgo/core/reorder/scaled_reordered.hpp +++ b/include/ginkgo/core/reorder/scaled_reordered.hpp @@ -36,6 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp index c7b47a0e807..3bb1a69e350 100644 --- a/include/ginkgo/core/solver/bicg.hpp +++ b/include/ginkgo/core/solver/bicg.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp index 214e669b2ff..eef2e454698 100644 --- a/include/ginkgo/core/solver/bicgstab.hpp +++ b/include/ginkgo/core/solver/bicgstab.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/cb_gmres.hpp b/include/ginkgo/core/solver/cb_gmres.hpp index a2dbb1efce1..9cf6c3913ae 100644 --- a/include/ginkgo/core/solver/cb_gmres.hpp +++ b/include/ginkgo/core/solver/cb_gmres.hpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp index bc0861cf270..b57abe73467 100644 --- a/include/ginkgo/core/solver/cg.hpp +++ b/include/ginkgo/core/solver/cg.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp index 22f81d8a292..57f9c8a9735 100644 --- a/include/ginkgo/core/solver/cgs.hpp +++ b/include/ginkgo/core/solver/cgs.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp index cad7a29fc27..e13529eb38c 100644 --- a/include/ginkgo/core/solver/fcg.hpp +++ b/include/ginkgo/core/solver/fcg.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index d7d0f57a8a4..95dbdba0d1d 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index fc677f33171..5ab0cb17c3f 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index c5c69c1fb67..85ff3e970a6 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index 2d04a889445..cd4a3ed6f9a 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/triangular.hpp b/include/ginkgo/core/solver/triangular.hpp index a05c8d62b84..96a8c58b040 100644 --- a/include/ginkgo/core/solver/triangular.hpp +++ b/include/ginkgo/core/solver/triangular.hpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/stop/stopping_status.hpp b/include/ginkgo/core/stop/stopping_status.hpp index ee7d7890cf4..addc06b3fbb 100644 --- a/include/ginkgo/core/stop/stopping_status.hpp +++ b/include/ginkgo/core/stop/stopping_status.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index bcdaa5d2d20..40502df13a1 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -51,6 +51,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include #include #include diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 47259feeac0..50925c106bc 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -75,7 +75,6 @@ target_include_directories(ginkgo_omp PRIVATE "${OpenMP_CXX_INCLUDE_DIRS}") separate_arguments(OpenMP_SEP_FLAGS NATIVE_COMMAND "${OpenMP_CXX_FLAGS}") target_compile_options(ginkgo_omp PRIVATE "${OpenMP_SEP_FLAGS}") target_compile_options(ginkgo_omp PRIVATE "${GINKGO_COMPILER_FLAGS}") - # Need to link against ginkgo_cuda for the `raw_copy_to(CudaExecutor ...)` method target_link_libraries(ginkgo_omp PRIVATE ginkgo_cuda) # Need to link against ginkgo_hip for the `raw_copy_to(HipExecutor ...)` method diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index 9ff4cee376a..90c754907d1 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include @@ -61,10 +62,75 @@ void atomic_add(ValueType& out, ValueType val) // The C++ standard explicitly allows casting complex* to double* // [complex.numbers.general] auto values = reinterpret_cast*>(&out); -#pragma omp atomic - values[0] += real(val); -#pragma omp atomic - values[1] += imag(val); + atomic_add(values[0], real(val)); + atomic_add(values[1], imag(val)); +} + + +template +inline ResultType reinterpret(ValueType val) +{ + static_assert(sizeof(ValueType) == sizeof(ResultType), + "The type to reinterpret to must be of the same size as the " + "original type."); + return reinterpret_cast(val); +} + + +template <> +void atomic_add(half& out, half val) +{ +#ifdef __NVCOMPILER +// NVC++ uses atomic capture on uint16 leads the following error. +// use of undefined value '%L.B*' br label %L.B* !llvm.loop !*, !dbg !* +#pragma omp critical + { + out += val; + } +#else + // UB? + uint16_t* address_as_converter = reinterpret_cast(&out); + uint16_t old = *address_as_converter; + uint16_t assumed; + do { + assumed = old; + auto answer = reinterpret(reinterpret(assumed) + val); +#pragma omp atomic capture + { + old = *address_as_converter; + *address_as_converter = (old == assumed) ? answer : old; + } + } while (assumed != old); +#endif +} + + +template <> +void atomic_add(bfloat16& out, bfloat16 val) +{ +#ifdef __NVCOMPILER +// NVC++ uses atomic capture on uint16 leads the following error. +// use of undefined value '%L.B*' br label %L.B* !llvm.loop !*, !dbg !* +#pragma omp critical + { + out += val; + } +#else + // UB? + uint16_t* address_as_converter = reinterpret_cast(&out); + uint16_t old = *address_as_converter; + uint16_t assumed; + do { + assumed = old; + auto answer = + reinterpret(reinterpret(assumed) + val); +#pragma omp atomic capture + { + old = *address_as_converter; + *address_as_converter = (old == assumed) ? 
answer : old; + } + } while (assumed != old); +#endif } diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp index b2c443635e8..90f0a243e19 100644 --- a/omp/factorization/par_ilut_kernels.cpp +++ b/omp/factorization/par_ilut_kernels.cpp @@ -213,7 +213,12 @@ void threshold_filter_approx(std::shared_ptr exec, // pick splitters for (IndexType i = 0; i < bucket_count - 1; ++i) { // shift by one so we get upper bounds for the buckets - sample[i] = sample[(i + 1) * sampleselect_oversampling]; + // TODO FIXME: NVHPC 23.3 seems to handle assignment index with + // optimization wrongly on a custom class when IndexType is long. We set + // the index explicitly with volatile to solve it. + // https://godbolt.org/z/srYhGndKn + volatile auto index = (i + 1) * sampleselect_oversampling; + sample[i] = sample[index]; } // count elements per bucket auto total_histogram = reinterpret_cast(sample + bucket_count); diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 7d4a5a7ebd1..73a2d404de3 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -127,8 +127,8 @@ void advanced_spmv(std::shared_ptr exec, auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - arithmetic_type valpha = alpha->at(0, 0); - arithmetic_type vbeta = beta->at(0, 0); + arithmetic_type valpha = static_cast(alpha->at(0, 0)); + arithmetic_type vbeta = static_cast(beta->at(0, 0)); const auto a_vals = acc::helper::build_const_rrm_accessor(a); diff --git a/omp/matrix/fft_kernels.cpp b/omp/matrix/fft_kernels.cpp index 2e9f30f3860..beb15a7176c 100644 --- a/omp/matrix/fft_kernels.cpp +++ b/omp/matrix/fft_kernels.cpp @@ -149,7 +149,7 @@ void fft(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -220,7 +220,8 @@ void fft2(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -325,7 +326,8 @@ void fft3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp index e8fa36556ba..9b5df4b1782 100644 --- a/omp/solver/cb_gmres_kernels.cpp +++ b/omp/solver/cb_gmres_kernels.cpp @@ -361,7 +361,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp index 8f4acf8a747..465912b7b1c 100644 --- a/omp/solver/idr_kernels.cpp +++ b/omp/solver/idr_kernels.cpp @@ -167,7 +167,8 @@ void initialize(std::shared_ptr exec, const size_type nrhs, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - auto dist = std::normal_distribution>(0.0, 1.0); + auto dist = std::normal_distribution< + typename detail::arth_type>::type>(0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index dd54e3fb52f..c445d9dad03 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -70,7 +70,6 @@ target_compile_options(ginkgo_reference PRIVATE "${GINKGO_COMPILER_FLAGS}") if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC") set_source_files_properties(preconditioner/jacobi_kernels.cpp PROPERTIES COMPILE_FLAGS "-O1") endif() - if (GINKGO_CHECK_CIRCULAR_DEPS) ginkgo_check_headers(ginkgo_reference "") endif() 
diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp index f26da021681..83ada9201ea 100644 --- a/reference/factorization/par_ilut_kernels.cpp +++ b/reference/factorization/par_ilut_kernels.cpp @@ -222,7 +222,12 @@ void threshold_filter_approx(std::shared_ptr exec, // pick splitters for (IndexType i = 0; i < bucket_count - 1; ++i) { // shift by one so we get upper bounds for the buckets - sample[i] = sample[(i + 1) * sampleselect_oversampling]; + // TODO FIXME: NVHPC 23.3 seems to handle assignment index with + // optimization wrongly on a custom class when IndexType is long. We set + // the index explicitly with volatile to solve it. + // https://godbolt.org/z/srYhGndKn + volatile auto index = (i + 1) * sampleselect_oversampling; + sample[i] = sample[index]; } // count elements per bucket auto histogram = reinterpret_cast(sample + bucket_count); diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index 3a05a09cd45..60c71357abd 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -124,8 +124,8 @@ void advanced_spmv(std::shared_ptr exec, auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - arithmetic_type valpha = alpha->at(0, 0); - arithmetic_type vbeta = beta->at(0, 0); + arithmetic_type valpha = static_cast(alpha->at(0, 0)); + arithmetic_type vbeta = static_cast(beta->at(0, 0)); const auto a_vals = acc::helper::build_const_rrm_accessor(a); diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 6a78490af9c..b4de6c418f4 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -137,7 +137,8 @@ void advanced_spmv(std::shared_ptr exec, for (size_type j = 0; j < c->get_size()[1]; j++) { for (size_type row = 0; row < a->get_size()[0]; row++) { - arithmetic_type result = c->at(row, j); + arithmetic_type result = + static_cast(c->at(row, j)); result *= beta_val; for 
(size_type i = 0; i < num_stored_elements_per_row; i++) { arithmetic_type val = a_vals(row + i * stride); diff --git a/reference/matrix/fft_kernels.cpp b/reference/matrix/fft_kernels.cpp index bdf056cf882..8a79f72f5f1 100644 --- a/reference/matrix/fft_kernels.cpp +++ b/reference/matrix/fft_kernels.cpp @@ -146,7 +146,7 @@ void fft(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -213,7 +213,8 @@ void fft2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -313,7 +314,8 @@ void fft3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp index 2df07cf9258..b24ee03f11b 100644 --- a/reference/solver/cb_gmres_kernels.cpp +++ b/reference/solver/cb_gmres_kernels.cpp @@ -325,7 +325,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp index 15a95ae0711..f9604a55313 100644 --- a/reference/solver/idr_kernels.cpp +++ b/reference/solver/idr_kernels.cpp @@ -152,7 +152,9 @@ void initialize(std::shared_ptr exec, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - auto dist = std::normal_distribution>(0.0, 1.0); + auto dist = std::normal_distribution< + typename ::gko::detail::arth_type>::type>( + 0.0, 1.0); auto seed 
= std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index 4f922c37703..21136a172ae 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -354,7 +354,7 @@ TYPED_TEST(MultiVector, ConvertsToPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : static_cast>(r::value); this->mtx_1->convert_to(tmp.get()); tmp->convert_to(res.get()); @@ -377,7 +377,7 @@ TYPED_TEST(MultiVector, MovesToPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : static_cast>(r::value); this->mtx_1->move_to(tmp.get()); tmp->move_to(res.get()); diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp index 0789b446d23..1c6736c289b 100644 --- a/reference/test/base/combination.cpp +++ b/reference/test/base/combination.cpp @@ -147,7 +147,7 @@ TYPED_TEST(Combination, AppliesToMixedVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -189,7 +189,7 @@ TYPED_TEST(Combination, AppliesToMixedComplexVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::to_complex>; + using value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -233,7 +233,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = 
gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -281,7 +281,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedComplexVector) cmb = [ 8 7 ] [ 5 4 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmb = gko::Combination::create( diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp index 0b89606dd9d..019a2eb4cbf 100644 --- a/reference/test/base/composition.cpp +++ b/reference/test/base/composition.cpp @@ -175,7 +175,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using Mtx = gko::matrix::Dense>; + using Mtx = gko::matrix::Dense>; using value_type = typename Mtx::value_type; auto cmp = gko::Composition::create(this->product); auto x = gko::initialize({1.0, 2.0}, this->exec); @@ -215,7 +215,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedComplexVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using value_type = gko::next_precision>; + using value_type = next_precision>; using Mtx = gko::matrix::Dense; auto cmp = gko::Composition::create(this->product); auto x = gko::initialize( @@ -255,7 +255,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmp = gko::Composition::create(this->product); auto alpha = gko::initialize({3.0}, this->exec); @@ -300,7 +300,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedComplexVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmp = gko::Composition::create(this->product); diff --git 
a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp index 45483112b7c..b82d1485f93 100644 --- a/reference/test/base/perturbation.cpp +++ b/reference/test/base/perturbation.cpp @@ -134,7 +134,7 @@ TYPED_TEST(Perturbation, AppliesToMixedVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using Mtx = gko::matrix::Dense>; + using Mtx = gko::matrix::Dense>; using value_type = typename Mtx::value_type; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -176,7 +176,7 @@ TYPED_TEST(Perturbation, AppliesToMixedComplexVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using value_type = gko::to_complex>; + using value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -218,7 +218,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -265,7 +265,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedComplexVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmp = gko::Perturbation::create(this->scalar, this->basis, diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp index 5cde9f132d3..9c6934dcb27 100644 --- a/reference/test/factorization/lu_kernels.cpp +++ b/reference/test/factorization/lu_kernels.cpp @@ -238,7 +238,7 @@ TYPED_TEST(Lu, KernelFactorizeWorks) diag_idxs.get_const_data(), this->mtx_lu.get(), tmp); GKO_ASSERT_MTX_NEAR(this->mtx_lu, mtx_lu_ref, - 15 * r::value); + 30 * r::value); }); } @@ -284,7 +284,7 @@ TYPED_TEST(Lu, 
FactorizeNonsymmetricWorks) GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), this->mtx_lu); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); @@ -311,7 +311,7 @@ TYPED_TEST(Lu, FactorizeWithKnownSparsityWorks) auto lu = factory->generate(this->mtx); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp index 9da285ec3eb..3227e33cce6 100644 --- a/reference/test/factorization/par_ilut_kernels.cpp +++ b/reference/test/factorization/par_ilut_kernels.cpp @@ -86,6 +86,7 @@ class ParIlut : public ::testing::Test { using ComplexCsr = gko::matrix::Csr>, index_type>; + using complex_value_type = std::complex>; ParIlut() : ref(gko::ReferenceExecutor::create()), @@ -107,16 +108,24 @@ class ParIlut : public ::testing::Test { {0., -3., 0., 1.}}, ref)), mtx1_complex(gko::initialize( - {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, - {{-1., .1}, {.1, -1.}, {0., 0.}, {0., 0.}}, - {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}}, - {{1., -2.}, {-3., -.1}, {-1., .1}, {.1, 2.}}}, + {{complex_value_type{.1, 0.}, complex_value_type{0., 0.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., .1}, complex_value_type{.1, -1.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., 1.}, complex_value_type{-2., .2}, + complex_value_type{-1., -.3}, complex_value_type{0., 0.}}, + {complex_value_type{1., -2.}, complex_value_type{-3., -.1}, + complex_value_type{-1., .1}, complex_value_type{.1, 2.}}}, ref)), mtx1_expect_complex_thrm(gko::initialize( - {{{.1, 0.}, {0., 0.}, {0., 
0.}, {0., 0.}}, - {{0., 0.}, {.1, -1.}, {0., 0.}, {0., 0.}}, - {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}}, - {{1., -2.}, {-3., -.1}, {0., 0.}, {.1, 2.}}}, + {{complex_value_type{.1, 0.}, complex_value_type{0., 0.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{0., 0.}, complex_value_type{.1, -1.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., 1.}, complex_value_type{-2., .2}, + complex_value_type{-1., -.3}, complex_value_type{0., 0.}}, + {complex_value_type{1., -2.}, complex_value_type{-3., -.1}, + complex_value_type{0., 0.}, complex_value_type{.1, 2.}}}, ref)), identity(gko::initialize( {{1., 0., 0.}, {0., 1., 0.}, {0., 0., 1.}}, ref)), diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index 375486cd72f..5c606dcae16 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -64,7 +64,7 @@ class Coo : public ::testing::Test { using Csr = gko::matrix::Csr; using Mtx = gko::matrix::Coo; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Coo() : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec)) { @@ -111,7 +111,7 @@ TYPED_TEST(Coo, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); @@ -119,7 +119,9 @@ TYPED_TEST(Coo, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->convert_to(tmp); tmp->convert_to(res); @@ -132,7 +134,7 @@ TYPED_TEST(Coo, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); @@ -140,11 +142,15 @@ TYPED_TEST(Coo, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->move_to(tmp); tmp->move_to(res); + // TODO: When use move_to to the different precision, it will keep the + // original data GKO_ASSERT_MTX_NEAR(this->mtx, res, residual); } @@ -244,7 +250,7 @@ TYPED_TEST(Coo, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); @@ -261,7 +267,7 @@ TYPED_TEST(Coo, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); @@ -731,8 +737,7 @@ TYPED_TEST(Coo, AppliesToComplex) TYPED_TEST(Coo, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = 
gko::ReferenceExecutor::create(); @@ -787,8 +792,7 @@ TYPED_TEST(Coo, AdvancedAppliesToComplex) TYPED_TEST(Coo, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; @@ -845,8 +849,7 @@ TYPED_TEST(Coo, ApplyAddsToComplex) TYPED_TEST(Coo, ApplyAddsToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedVec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -902,8 +905,7 @@ TYPED_TEST(Coo, ApplyAddsScaledToComplex) TYPED_TEST(Coo, ApplyAddsScaledToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index d56201ade02..f9f58b4bf28 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -75,7 +75,7 @@ class Csr : public ::testing::Test { using Ell = gko::matrix::Ell; using Hybrid = gko::matrix::Hybrid; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Csr() : exec(gko::ReferenceExecutor::create()), @@ -793,7 +793,7 @@ TYPED_TEST(Csr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -801,7 +801,9 @@ TYPED_TEST(Csr, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual 
= r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; // use mtx2 as mtx's strategy would involve creating a CudaExecutor this->mtx2->convert_to(tmp); @@ -818,7 +820,7 @@ TYPED_TEST(Csr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -826,7 +828,9 @@ TYPED_TEST(Csr, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; // use mtx2 as mtx's strategy would involve creating a CudaExecutor this->mtx2->move_to(tmp); @@ -995,7 +999,7 @@ TYPED_TEST(Csr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); @@ -1014,7 +1018,7 @@ TYPED_TEST(Csr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); @@ -1607,8 +1611,7 @@ TYPED_TEST(Csr, AppliesToComplex) TYPED_TEST(Csr, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ 
-1663,8 +1666,7 @@ TYPED_TEST(Csr, AdvancedAppliesToComplex) TYPED_TEST(Csr, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 9edab89e382..13142e0e0d4 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -66,7 +66,7 @@ class Dense : public ::testing::Test { protected: using value_type = T; using Mtx = gko::matrix::Dense; - using MixedMtx = gko::matrix::Dense>; + using MixedMtx = gko::matrix::Dense>; using ComplexMtx = gko::to_complex; using MixedComplexMtx = gko::to_complex; using RealMtx = gko::remove_complex; @@ -106,8 +106,7 @@ class Dense : public ::testing::Test { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(num_cols, num_cols), - std::normal_distribution>(0.0, 1.0), - rand_engine, exec); + std::normal_distribution<>(0.0, 1.0), rand_engine, exec); } }; @@ -744,14 +743,16 @@ TYPED_TEST(Dense, ConvertsToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDense = typename gko::matrix::Dense; auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); // If OtherT is more precise: 0, otherwise r - auto residual = r::value < r::value - ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + auto residual = + r::value < r::value + ? 
gko::remove_complex{0} + : gko::remove_complex{ + static_cast>(r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -764,14 +765,16 @@ TYPED_TEST(Dense, MovesToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDense = typename gko::matrix::Dense; auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); // If OtherT is more precise: 0, otherwise r - auto residual = r::value < r::value - ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + auto residual = + r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{ + static_cast>(r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); @@ -1882,7 +1885,7 @@ TYPED_TEST(Dense, ConvertsEmptyToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDense = typename gko::matrix::Dense; auto empty = OtherDense::create(this->exec); auto res = Dense::create(this->exec); @@ -1897,7 +1900,7 @@ TYPED_TEST(Dense, MovesEmptyToPrecision) { using Dense = typename TestFixture::Mtx; using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDense = typename gko::matrix::Dense; auto empty = OtherDense::create(this->exec); auto res = Dense::create(this->exec); @@ -3588,8 +3591,7 @@ TYPED_TEST(Dense, AppliesToComplex) TYPED_TEST(Dense, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -3642,8 +3644,7 @@ TYPED_TEST(Dense, AdvancedAppliesToComplex) TYPED_TEST(Dense, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + 
using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp index f8803916363..6ee00be3e47 100644 --- a/reference/test/matrix/diagonal_kernels.cpp +++ b/reference/test/matrix/diagonal_kernels.cpp @@ -62,7 +62,7 @@ class Diagonal : public ::testing::Test { using Csr = gko::matrix::Csr; using Diag = gko::matrix::Diagonal; using Dense = gko::matrix::Dense; - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; Diagonal() : exec(gko::ReferenceExecutor::create()), @@ -117,7 +117,7 @@ TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator); TYPED_TEST(Diagonal, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = OtherDiagonal::create(this->exec); @@ -125,7 +125,9 @@ TYPED_TEST(Diagonal, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->diag1->convert_to(tmp); tmp->convert_to(res); @@ -137,7 +139,7 @@ TYPED_TEST(Diagonal, ConvertsToPrecision) TYPED_TEST(Diagonal, MovesToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = OtherDiagonal::create(this->exec); @@ -145,7 +147,9 @@ TYPED_TEST(Diagonal, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->diag1->move_to(tmp); tmp->move_to(res); @@ -602,8 +606,7 @@ TYPED_TEST(Diagonal, AppliesToComplex) TYPED_TEST(Diagonal, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -662,8 +665,7 @@ TYPED_TEST(Diagonal, AppliesLinearCombinationToComplex) TYPED_TEST(Diagonal, AppliesLinearCombinationToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; using Scalar = gko::matrix::Dense; diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index 135607230a5..0c6103916a3 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -63,7 +63,7 @@ class Ell : public ::testing::Test { using Mtx = gko::matrix::Ell; using Csr = gko::matrix::Csr; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Ell() : exec(gko::ReferenceExecutor::create()), @@ -124,7 +124,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = typename gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -139,7 +139,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = 
gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); @@ -155,9 +155,9 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; - using Vec2 = gko::matrix::Dense>; + using Vec2 = gko::matrix::Dense>; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec1::create(this->exec, gko::dim<2>{2, 1}); @@ -193,7 +193,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = gko::matrix::Dense; // clang-format off auto x = gko::initialize( @@ -217,7 +217,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -242,7 +242,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -281,7 +281,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -298,7 +298,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2) { // Input vector has 
same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -316,7 +316,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -360,7 +360,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -388,7 +388,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -417,7 +417,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -476,7 +476,7 @@ TYPED_TEST(Ell, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = 
next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); @@ -484,7 +484,9 @@ TYPED_TEST(Ell, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -497,7 +499,7 @@ TYPED_TEST(Ell, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); @@ -505,7 +507,9 @@ TYPED_TEST(Ell, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); @@ -765,7 +769,7 @@ TYPED_TEST(Ell, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); @@ -782,7 +786,7 @@ TYPED_TEST(Ell, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); @@ -926,8 +930,7 @@ TYPED_TEST(Ell, AppliesToComplex) TYPED_TEST(Ell, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = 
next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -983,8 +986,7 @@ TYPED_TEST(Ell, AdvancedAppliesToComplex) TYPED_TEST(Ell, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index e5b948df11a..260796c197c 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -146,7 +146,8 @@ std::unique_ptr> get_some_vectors( { using RT = gko::remove_complex; std::default_random_engine engine(39); - std::normal_distribution dist(0.0, 5.0); + std::normal_distribution::type> dist( + 0.0, 5.0); std::uniform_int_distribution<> nnzdist(1, nrhs); return gko::test::generate_random_matrix>( nrows, nrhs, nnzdist, dist, engine, exec); @@ -303,7 +304,7 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); @@ -311,7 +312,9 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->convert_to(tmp); tmp->convert_to(res); @@ -324,7 +327,7 @@ TYPED_TEST(Fbcsr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); @@ -332,7 +335,9 @@ TYPED_TEST(Fbcsr, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->move_to(tmp); tmp->move_to(res); @@ -421,7 +426,7 @@ TYPED_TEST(Fbcsr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); @@ -440,7 +445,7 @@ TYPED_TEST(Fbcsr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); diff --git a/reference/test/matrix/fft_kernels.cpp b/reference/test/matrix/fft_kernels.cpp index ba75582c6af..7cde806bed7 100644 --- a/reference/test/matrix/fft_kernels.cpp +++ b/reference/test/matrix/fft_kernels.cpp @@ -181,7 +181,8 @@ class Fft : public ::testing::Test { std::unique_ptr dense_ifft3; }; -TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fft, 
gko::test::ComplexValueTypesNoHalf, + TypenameNameGenerator); TYPED_TEST(Fft, ThrowsOnNonPowerOfTwo1D) diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index c234fe0179b..9573670ba81 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -64,7 +64,7 @@ class Hybrid : public ::testing::Test { using Mtx = gko::matrix::Hybrid; using Vec = gko::matrix::Dense; using Csr = gko::matrix::Csr; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Hybrid() : exec(gko::ReferenceExecutor::create()), @@ -265,7 +265,7 @@ TYPED_TEST(Hybrid, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto tmp = OtherHybrid::create(this->exec); @@ -273,7 +273,9 @@ TYPED_TEST(Hybrid, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -286,7 +288,7 @@ TYPED_TEST(Hybrid, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto tmp = OtherHybrid::create(this->exec); @@ -294,7 +296,9 @@ TYPED_TEST(Hybrid, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); @@ -396,7 +400,7 @@ TYPED_TEST(Hybrid, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); @@ -413,7 +417,7 @@ TYPED_TEST(Hybrid, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); @@ -727,8 +731,7 @@ TYPED_TEST(Hybrid, AppliesToComplex) TYPED_TEST(Hybrid, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -784,8 +787,7 @@ TYPED_TEST(Hybrid, AdvancedAppliesToComplex) TYPED_TEST(Hybrid, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp index 6836c0707ed..a406da16b18 100644 --- a/reference/test/matrix/identity.cpp +++ b/reference/test/matrix/identity.cpp @@ -51,7 +51,7 @@ class Identity : public ::testing::Test { using value_type = T; using Id = gko::matrix::Identity; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = 
gko::matrix::Dense>; using ComplexVec = gko::to_complex; using MixedComplexVec = gko::to_complex; diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index a5697fd1ce9..7a85e6c46a6 100644 --- a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -98,7 +98,7 @@ TYPED_TEST(Sellp, AppliesToDenseVector) TYPED_TEST(Sellp, AppliesToMixedDenseVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -147,7 +147,7 @@ TYPED_TEST(Sellp, AppliesLinearCombinationToDenseVector) TYPED_TEST(Sellp, AppliesLinearCombinationToMixedDenseVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -220,7 +220,7 @@ TYPED_TEST(Sellp, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); @@ -228,7 +228,9 @@ TYPED_TEST(Sellp, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -241,7 +243,7 @@ TYPED_TEST(Sellp, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); @@ -249,7 +251,9 @@ TYPED_TEST(Sellp, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); @@ -337,7 +341,7 @@ TYPED_TEST(Sellp, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); @@ -356,7 +360,7 @@ TYPED_TEST(Sellp, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp index dde558d27fd..fa0fc76c947 100644 --- a/reference/test/matrix/sparsity_csr_kernels.cpp +++ b/reference/test/matrix/sparsity_csr_kernels.cpp @@ -177,7 +177,7 @@ TYPED_TEST(SparsityCsr, AppliesToDenseVector) TYPED_TEST(SparsityCsr, AppliesToMixedDenseVector) { - using T = gko::next_precision; 
+ using T = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -224,7 +224,7 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseVector) TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedDenseVector) { - using T = gko::next_precision; + using T = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -275,8 +275,7 @@ TYPED_TEST(SparsityCsr, AppliesToComplex) TYPED_TEST(SparsityCsr, AppliesToMixedComplex) { - using T = - gko::next_precision>; + using T = next_precision>; using Vec = gko::matrix::Dense; auto x = gko::initialize({T{2.0, 4.0}, T{1.0, 2.0}, T{4.0, 8.0}}, this->exec); @@ -310,8 +309,8 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToComplex) TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedComplex) { - using Vec = gko::matrix::Dense< - gko::next_precision>; + using Vec = + gko::matrix::Dense>; using ComplexVec = gko::to_complex; using T = typename ComplexVec::value_type; auto alpha = gko::initialize({-1.0}, this->exec); diff --git a/reference/test/preconditioner/ic.cpp b/reference/test/preconditioner/ic.cpp index b3f7348adde..aae3f577492 100644 --- a/reference/test/preconditioner/ic.cpp +++ b/reference/test/preconditioner/ic.cpp @@ -278,7 +278,7 @@ TYPED_TEST(Ic, SolvesSingleRhsMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); auto x = Vec::create(this->exec, gko::dim<2>{3, 1}); auto preconditioner = @@ -312,7 +312,7 @@ TYPED_TEST(Ic, SolvesSingleRhsComplexMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using Vec = gko::matrix::Dense< - gko::next_precision>>; + next_precision>>; using T = typename Vec::value_type; const auto b = 
gko::initialize( {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec); @@ -348,7 +348,7 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); const auto alpha = gko::initialize({2.0}, this->exec); const auto beta = gko::initialize({-1.0}, this->exec); @@ -387,8 +387,8 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsComplex) TYPED_TEST(Ic, AdvancedSolvesSingleRhsComplexMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; - using MixedDense = gko::matrix::Dense< - gko::next_precision>; + using MixedDense = + gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using T = typename MixedDenseComplex::value_type; const auto b = gko::initialize( diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index ce3ea72725f..b9ab9683134 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -356,8 +356,8 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithMtx) TYPED_TEST(Ilu, SolvesSingleRhsWithMixedMtx) { - using Mtx = gko::matrix::Dense< - gko::next_precision>; + using Mtx = + gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); x->copy_from(b); @@ -391,7 +391,7 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithComplexMtx) TYPED_TEST(Ilu, SolvesSingleRhsWithMixedComplexMtx) { using Mtx = gko::matrix::Dense< - gko::to_complex>>; + gko::to_complex>>; using T = typename Mtx::value_type; const auto b = gko::initialize( {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec); @@ -444,7 +444,7 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhs) TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; const value_type 
alpha{2.0}; const auto alpha_linop = gko::initialize({alpha}, this->exec); @@ -494,7 +494,7 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhsComplex) TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixedComplex) { - using value_type = gko::next_precision; + using value_type = next_precision; using complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::to_complex; diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp index eea171d60fe..b9d3eb9bcff 100644 --- a/reference/test/preconditioner/isai_kernels.cpp +++ b/reference/test/preconditioner/isai_kernels.cpp @@ -1013,6 +1013,8 @@ TYPED_TEST(Isai, ReturnsCorrectInverseA) TYPED_TEST(Isai, ReturnsCorrectInverseALongrow) { using value_type = typename TestFixture::value_type; + // TODO: figure out whether relaxed residual norm works in half or not. + SKIP_IF_HALF(value_type); const auto isai = this->general_isai_factory->generate(this->a_csr_longrow); auto a_inv = isai->get_approximate_inverse(); @@ -1029,6 +1031,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseALongrowWithExcessSolver) { using value_type = typename TestFixture::value_type; using GeneralIsai = typename TestFixture::GeneralIsai; + SKIP_IF_HALF(value_type); auto general_isai_factory = GeneralIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1076,6 +1079,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseLLongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using LowerIsai = typename TestFixture::LowerIsai; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto lower_isai_factory = LowerIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1123,6 +1127,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseULongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using UpperIsai = typename TestFixture::UpperIsai; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto 
upper_isai_factory = UpperIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1223,6 +1228,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrow) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); const auto isai = this->spd_isai_factory->generate(this->spd_csr_longrow); const auto expected_transpose = gko::as(this->spd_csr_longrow_inv->transpose()); @@ -1246,6 +1252,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using SpdIsai = typename TestFixture::SpdIsai; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); const auto expected_transpose = gko::as(this->spd_csr_longrow_inv->transpose()); auto spd_isai_factory = diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp index 7fb7d85558c..571cc73133a 100644 --- a/reference/test/preconditioner/jacobi.cpp +++ b/reference/test/preconditioner/jacobi.cpp @@ -510,7 +510,7 @@ TYPED_TEST(Jacobi, ScalarJacobiGeneratesOnDifferentPrecision) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using next_type = gko::next_precision; + using next_type = next_precision; using Bj = typename TestFixture::Bj; auto csr = gko::share(gko::matrix::Csr::create(this->exec)); diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp index 679a56030da..4f755c35e0e 100644 --- a/reference/test/preconditioner/jacobi_kernels.cpp +++ b/reference/test/preconditioner/jacobi_kernels.cpp @@ -594,11 +594,14 @@ TYPED_TEST(Jacobi, SelectsCorrectBlockPrecisions) auto prec = bj->get_parameters().storage_optimization.block_wise.get_const_data(); - auto precision2 = std::is_same, float>::value - ? 
gko::precision_reduction(0, 0) // float - : gko::precision_reduction(0, 1); // double - EXPECT_EQ(prec[0], gko::precision_reduction(0, 2)); // u * cond = ~1.2e-3 - ASSERT_EQ(prec[1], precision2); // u * cond = ~2.0e-3 + auto precision1 = std::is_same, gko::half>::value + ? gko::precision_reduction(2, 0) + : gko::precision_reduction(0, 2); + auto precision2 = std::is_same, double>::value + ? gko::precision_reduction(0, 1) // double + : gko::precision_reduction(0, 0); // float, half + EXPECT_EQ(prec[0], precision1); // u * cond = ~1.2e-3 + ASSERT_EQ(prec[1], precision2); // u * cond = ~2.0e-3 } @@ -639,6 +642,9 @@ TYPED_TEST(Jacobi, AvoidsPrecisionsThatOverflow) auto precision = std::is_same, float>::value ? gko::precision_reduction(0, 2) // float : gko::precision_reduction(1, 1); // double + if (std::is_same, gko::half>::value) { + precision = gko::precision_reduction(2, 0); + } EXPECT_EQ(prec[0], precision); ASSERT_EQ(prec[1], precision); } @@ -675,7 +681,7 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesToVector) TYPED_TEST(Jacobi, AppliesToMixedVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); @@ -716,7 +722,7 @@ TYPED_TEST(Jacobi, AppliesToComplexVector) TYPED_TEST(Jacobi, AppliesToMixedComplexVector) { using value_type = - gko::to_complex>; + gko::to_complex>; using Vec = gko::matrix::Dense; auto x = gko::initialize( {value_type{1.0, 2.0}, value_type{-1.0, -2.0}, value_type{2.0, 4.0}, @@ -921,7 +927,7 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesLinearCombinationToVector) TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, 
this->exec); @@ -964,7 +970,7 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToComplexVector) TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedComplexVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::to_complex; using T = gko::to_complex; diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp index 8789ded37ca..3ebcaaf5506 100644 --- a/reference/test/reorder/scaled_reordered.cpp +++ b/reference/test/reorder/scaled_reordered.cpp @@ -396,6 +396,8 @@ TYPED_TEST(ScaledReordered, AppliesWithRcmReordering) TYPED_TEST(ScaledReordered, SolvesSingleRhsWithOnlyInnerOperator) { using SR = typename TestFixture::SR; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto scaled_reordered_fact = SR::build().with_inner_operator(this->solver_factory).on(this->exec); auto scaled_reordered = scaled_reordered_fact->generate(this->rcm_mtx); @@ -442,6 +444,8 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithColScaling) TYPED_TEST(ScaledReordered, SolvesSingleRhsWithRcmReordering) { using SR = typename TestFixture::SR; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto scaled_reordered_fact = SR::build() .with_reordering(this->rcm_factory) .with_inner_operator(this->solver_factory) @@ -477,7 +481,8 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed) { using SR = typename TestFixture::SR; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; + SKIP_IF_HALF(T); auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) @@ -499,6 +504,8 @@ TYPED_TEST(ScaledReordered, AdvancedSolvesSingleRhsWithScalingAndRcmReordering) { using SR = typename TestFixture::SR; using Vec = typename TestFixture::Vec; + using T = typename 
TestFixture::value_type; + SKIP_IF_HALF(T); const auto alpha = gko::initialize({2.0}, this->exec); const auto beta = gko::initialize({-1.0}, this->exec); auto scaled_reordered_fact = SR::build() @@ -521,8 +528,9 @@ TYPED_TEST(ScaledReordered, { using SR = typename TestFixture::SR; using T = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; + SKIP_IF_HALF(T); auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp index e317677b2de..63c5c4a1704 100644 --- a/reference/test/solver/bicg_kernels.cpp +++ b/reference/test/solver/bicg_kernels.cpp @@ -307,7 +307,7 @@ TYPED_TEST(Bicg, SolvesStencilSystem) TYPED_TEST(Bicg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -344,7 +344,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemComplex) TYPED_TEST(Bicg, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto b = gko::initialize( @@ -399,7 +399,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -440,8 +440,8 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + 
gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->bicg_factory->generate(this->mtx); @@ -487,6 +487,7 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -504,6 +505,7 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -521,6 +523,7 @@ TYPED_TEST(Bicg, SolvesBigDenseSystemImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -552,6 +555,7 @@ TYPED_TEST(Bicg, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index ec44b6b6f17..bdc7ce58516 100644 --- a/reference/test/solver/bicgstab_kernels.cpp +++ b/reference/test/solver/bicgstab_kernels.cpp @@ -422,7 +422,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystem) TYPED_TEST(Bicgstab, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver 
= this->bicgstab_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -459,7 +459,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemComplex) TYPED_TEST(Bicgstab, SolvesDenseSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto b = gko::initialize( @@ -528,13 +528,13 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply) solver->apply(alpha, b, beta, x); - GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), r::value); + GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 2 * r::value); } TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -545,7 +545,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed) solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), - (r_mixed())); + (2 * r_mixed())); } @@ -561,22 +561,22 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}}, this->exec); solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, - value_type{6.0, -12.0}}), + l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5}, + value_type{6.0, -15.0}}), r::value); } TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = 
this->bicgstab_factory->generate(this->mtx); @@ -586,14 +586,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}}, this->exec); solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, - value_type{6.0, -12.0}}), + l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5}, + value_type{6.0, -15.0}}), (r_mixed())); } @@ -624,6 +624,8 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -652,6 +654,8 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -681,6 +685,7 @@ TYPED_TEST(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_HALF(value_type); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0}, diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp index 1127d7caff7..0b3580163b1 100644 --- a/reference/test/solver/cb_gmres_kernels.cpp +++ b/reference/test/solver/cb_gmres_kernels.cpp @@ -203,7 +203,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystem) 
TYPED_TEST(CbGmres, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cb_gmres_factory->generate(this->mtx); auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); @@ -242,7 +242,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystemComplex) TYPED_TEST(CbGmres, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->cb_gmres_factory->generate(this->mtx); auto b = @@ -327,7 +327,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cb_gmres_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -370,8 +370,8 @@ TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->cb_gmres_factory->generate(this->mtx); diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp index 76b8cf55946..b97fe563a30 100644 --- a/reference/test/solver/cg_kernels.cpp +++ b/reference/test/solver/cg_kernels.cpp @@ -268,7 +268,7 @@ TYPED_TEST(Cg, SolvesStencilSystem) TYPED_TEST(Cg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -305,7 +305,7 @@ TYPED_TEST(Cg, SolvesStencilSystemComplex) TYPED_TEST(Cg, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + 
gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto b = gko::initialize( @@ -360,7 +360,7 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -401,8 +401,8 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->cg_factory->generate(this->mtx); @@ -448,6 +448,7 @@ TYPED_TEST(Cg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -465,6 +466,7 @@ TYPED_TEST(Cg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -482,6 +484,7 @@ TYPED_TEST(Cg, SolvesBigDenseSystem3) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -499,6 +502,7 @@ TYPED_TEST(Cg, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename 
TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -567,6 +571,7 @@ TYPED_TEST(Cg, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -584,6 +589,7 @@ TYPED_TEST(Cg, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index 9c3ce2071a7..4a3f5001a1d 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -320,6 +320,7 @@ TYPED_TEST(Cgs, SolvesDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_BFLOAT16(value_type); auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -332,8 +333,9 @@ TYPED_TEST(Cgs, SolvesDenseSystem) TYPED_TEST(Cgs, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; + SKIP_IF_BFLOAT16(typename TestFixture::value_type); auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -369,7 +371,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemComplex) TYPED_TEST(Cgs, SolvesDenseSystemMixedComplex) { using 
value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize( @@ -393,6 +395,7 @@ TYPED_TEST(Cgs, SolvesMultipleDenseSystem) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_BFLOAT16(value_type); auto half_tol = std::sqrt(r::value); auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize( @@ -425,7 +428,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -445,6 +448,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex) using Scalar = typename TestFixture::Mtx; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; + // different initial guess leads complex divergent. + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -466,10 +471,12 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; + // different initial guess leads complex divergent. 
+ SKIP_IF_HALF(value_type); auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -494,6 +501,7 @@ TYPED_TEST(Cgs, SolvesMultipleDenseSystemsUsingAdvancedApply) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_BFLOAT16(value_type); auto half_tol = std::sqrt(r::value); auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -514,6 +522,7 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -530,6 +539,7 @@ TYPED_TEST(Cgs, SolvesBigDenseSystemWithImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); @@ -546,6 +556,7 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); @@ -562,6 +573,7 @@ TYPED_TEST(Cgs, SolvesMultipleDenseSystems) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -628,6 +640,7 @@ TYPED_TEST(Cgs, SolvesTransposedBigDenseSystem) { using Mtx = 
typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -644,6 +657,7 @@ TYPED_TEST(Cgs, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp index 617015bac1f..fab146135f5 100644 --- a/reference/test/solver/direct.cpp +++ b/reference/test/solver/direct.cpp @@ -81,7 +81,7 @@ class Direct : public ::testing::Test { .on(exec)) .on(exec); solver = factory->generate(mtx); - std::normal_distribution> dist(0, 1); + std::normal_distribution<> dist(0, 1); x = gko::test::generate_random_dense_matrix( mtx->get_size()[0], nrhs, dist, rng, this->exec); x_ref = x->clone(); diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp index e8163752689..e5803bfdc22 100644 --- a/reference/test/solver/fcg_kernels.cpp +++ b/reference/test/solver/fcg_kernels.cpp @@ -281,7 +281,7 @@ TYPED_TEST(Fcg, SolvesStencilSystem) TYPED_TEST(Fcg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -318,7 +318,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemComplex) TYPED_TEST(Fcg, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto b = gko::initialize( @@ -373,7 +373,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Fcg, 
SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -414,8 +414,8 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->fcg_factory->generate(this->mtx); @@ -461,6 +461,7 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -478,6 +479,7 @@ TYPED_TEST(Fcg, SolvesBigDenseSystemWithImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -495,6 +497,7 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -512,6 +515,7 @@ TYPED_TEST(Fcg, SolvesMultipleBigDenseSystems) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 
1176858.5}, @@ -580,6 +584,7 @@ TYPED_TEST(Fcg, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -597,6 +602,7 @@ TYPED_TEST(Fcg, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp index 888cbc3b4fe..1afe813c12b 100644 --- a/reference/test/solver/gcr_kernels.cpp +++ b/reference/test/solver/gcr_kernels.cpp @@ -274,7 +274,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemMixed) solver->apply(b.get(), x.get()); GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), - (r_mixed())); + (r_mixed() * 1e1)); } @@ -319,7 +319,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemMixedComplex) GKO_ASSERT_MTX_NEAR(x, l({value_type{1.0, -2.0}, value_type{3.0, -6.0}, value_type{2.0, -4.0}}), - (r_mixed())); + (r_mixed() * 1e1)); } @@ -370,7 +370,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixed) solver->apply(alpha.get(), b.get(), beta.get(), x.get()); GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), - (r_mixed()) * 1e1); + (r_mixed()) * 1e2); } @@ -449,6 +449,7 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -466,6 +467,7 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename 
TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -511,6 +513,7 @@ TYPED_TEST(Gcr, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -577,6 +580,7 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1WithRestart) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); auto gcr_factory_restart = Solver::build() @@ -605,6 +609,7 @@ TYPED_TEST(Gcr, SolvesWithPreconditioner) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto gcr_factory_preconditioner = Solver::build() .with_criteria( @@ -635,6 +640,7 @@ TYPED_TEST(Gcr, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -652,6 +658,7 @@ TYPED_TEST(Gcr, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 585fec833bc..e34ffc95e23 100644 --- 
a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -422,7 +422,7 @@ TYPED_TEST(Gmres, SolvesStencilSystem) TYPED_TEST(Gmres, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); @@ -460,7 +460,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemComplex) TYPED_TEST(Gmres, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto b = @@ -516,7 +516,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -558,8 +558,8 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->gmres_factory->generate(this->mtx); @@ -606,6 +606,7 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -623,6 +624,7 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = 
this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -640,6 +642,7 @@ TYPED_TEST(Gmres, SolveWithImplicitResNormCritIsDisabled) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -654,6 +657,7 @@ TYPED_TEST(Gmres, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -720,6 +724,7 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); auto gmres_factory_restart = Solver::build() @@ -748,6 +753,7 @@ TYPED_TEST(Gmres, SolvesWithPreconditioner) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto gmres_factory_preconditioner = Solver::build() .with_criteria( @@ -778,6 +784,7 @@ TYPED_TEST(Gmres, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -795,6 +802,7 @@ TYPED_TEST(Gmres, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename 
TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp index 3e74e0c319b..7ad933142e9 100644 --- a/reference/test/solver/idr_kernels.cpp +++ b/reference/test/solver/idr_kernels.cpp @@ -95,7 +95,10 @@ class Idr : public ::testing::Test { std::unique_ptr idr_factory_precision; }; -TYPED_TEST_SUITE(Idr, gko::test::ValueTypes, TypenameNameGenerator); +// Solves((Conj)Trans)DenseSystem((Mixed)Complex) does not work in some default +// random generator from different environments. All tests will SKIP half, so we +// do not test half here. +TYPED_TEST_SUITE(Idr, gko::test::ValueTypesNoHalf, TypenameNameGenerator); TYPED_TEST(Idr, SolvesDenseSystem) @@ -114,7 +117,8 @@ TYPED_TEST(Idr, SolvesDenseSystem) TYPED_TEST(Idr, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using T = typename TestFixture::value_type; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -129,6 +133,7 @@ TYPED_TEST(Idr, SolvesDenseSystemMixed) TYPED_TEST(Idr, SolvesDenseSystemComplex) { + using T = typename TestFixture::value_type; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->idr_factory->generate(this->mtx); @@ -150,8 +155,8 @@ TYPED_TEST(Idr, SolvesDenseSystemComplex) TYPED_TEST(Idr, SolvesDenseSystemMixedComplex) { - using value_type = - gko::to_complex>; + using T = typename TestFixture::value_type; + using value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize( @@ -175,6 +180,7 @@ TYPED_TEST(Idr, SolvesDenseSystemWithComplexSubSpace) using Mtx = typename TestFixture::Mtx; using value_type = typename 
TestFixture::value_type; using Solver = typename TestFixture::Solver; + // intermediate value is too small to represent in half auto half_tol = std::sqrt(r::value); auto solver_factory = Solver::build() @@ -269,8 +275,9 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; + SKIP_IF_HALF(typename TestFixture::value_type); auto solver = this->idr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -310,8 +317,8 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->idr_factory->generate(this->mtx); @@ -338,6 +345,7 @@ TYPED_TEST(Idr, SolvesMultipleDenseSystemsUsingAdvancedApply) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_HALF(T); auto half_tol = std::sqrt(r::value); auto solver = this->idr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp index 8b4255b72ef..0eb45e7026e 100644 --- a/reference/test/solver/ir_kernels.cpp +++ b/reference/test/solver/ir_kernels.cpp @@ -116,7 +116,7 @@ TYPED_TEST(Ir, SolvesTriangularSystem) TYPED_TEST(Ir, SolvesTriangularSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->ir_factory->generate(this->mtx); auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); @@ -153,7 +153,7 @@ TYPED_TEST(Ir, SolvesTriangularSystemComplex) TYPED_TEST(Ir, 
SolvesTriangularSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->ir_factory->generate(this->mtx); auto b = gko::initialize( @@ -279,8 +279,8 @@ TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->ir_factory->generate(this->mtx); diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp index ed3fff964e6..0a5d6d47e54 100644 --- a/reference/test/solver/lower_trs_kernels.cpp +++ b/reference/test/solver/lower_trs_kernels.cpp @@ -133,7 +133,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystem) TYPED_TEST(LowerTrs, SolvesTriangularSystemMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -171,7 +171,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemComplex) TYPED_TEST(LowerTrs, SolvesTriangularSystemMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; std::shared_ptr b = gko::initialize( @@ -242,7 +242,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApply) TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto alpha = gko::initialize({2.0}, this->exec); auto beta 
= gko::initialize({-1.0}, this->exec); @@ -284,7 +284,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto alpha = gko::initialize({2.0}, this->exec); diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index 3efb9d41c5e..be19cc6f29e 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -186,7 +186,7 @@ class DummyLinOpWithFactory { auto alpha_value = gko::as>(alpha)->at(0, 0); - gko::remove_complex scale = std::real(alpha_value); + gko::remove_complex scale = gko::real(alpha_value); global_step *= static_cast(scale); step.push_back(global_step); global_step++; @@ -265,7 +265,7 @@ class Multigrid : public ::testing::Test { using Smoother = gko::solver::Ir; using InnerSolver = gko::preconditioner::Jacobi; using CoarsestSolver = gko::solver::Cg; - using CoarsestNextSolver = gko::solver::Cg>; + using CoarsestNextSolver = gko::solver::Cg>; using DummyRPFactory = DummyMultigridLevelWithFactory; using DummyFactory = DummyLinOpWithFactory; Multigrid() diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp index 148c68bdcb3..915d7b8dd5e 100644 --- a/reference/test/solver/upper_trs_kernels.cpp +++ b/reference/test/solver/upper_trs_kernels.cpp @@ -133,7 +133,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystem) TYPED_TEST(UpperTrs, SolvesTriangularSystemMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); auto x = 
gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -171,7 +171,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemComplex) TYPED_TEST(UpperTrs, SolvesTriangularSystemMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; std::shared_ptr b = gko::initialize( @@ -243,7 +243,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApply) TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -285,7 +285,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto alpha = gko::initialize({2.0}, this->exec); diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp index 1c18fbb895d..498fea61cb8 100644 --- a/reference/test/stop/residual_norm_kernels.cpp +++ b/reference/test/stop/residual_norm_kernels.cpp @@ -398,7 +398,9 @@ TYPED_TEST(ResidualNorm, SelfCalculatesAndWaitsTillResidualGoal) ASSERT_FALSE(abs_criterion->update().solution(solution).check( RelativeStoppingId, true, &stop_status, &one_changed)); - solution->at(0) = rhs_val - r::value * T{1.2}; + // TODO FIXME: NVHPC calculates different result of rhs - r*1.2 from + // rhs - tmp = rhs - (r * 1.2). 
https://godbolt.org/z/GrGE9PE67 + solution->at(0) = rhs_val - r::value * T{1.4}; ASSERT_FALSE(abs_criterion->update().solution(solution).check( RelativeStoppingId, true, &stop_status, &one_changed)); ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp index edb94ef0beb..67105c8e2e2 100644 --- a/test/base/device_matrix_data_kernels.cpp +++ b/test/base/device_matrix_data_kernels.cpp @@ -67,8 +67,7 @@ class DeviceMatrixData : public CommonTestFixture { 0, host_data.size[0] - 1); std::uniform_int_distribution col_distr( 0, host_data.size[1] - 1); - std::uniform_real_distribution> - val_distr(1.0, 2.0); + std::uniform_real_distribution<> val_distr(1.0, 2.0); // add random entries for (int i = 0; i < 1000; i++) { host_data.nonzeros.emplace_back( diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp index 8ee0089c49c..bb7e195ad2c 100644 --- a/test/components/fill_array_kernels.cpp +++ b/test/components/fill_array_kernels.cpp @@ -53,7 +53,7 @@ class FillArray : public CommonTestFixture { protected: using value_type = T; FillArray() - : total_size(63531), + : total_size(3000), vals{ref, total_size}, dvals{exec, total_size}, seqs{ref, total_size} @@ -68,8 +68,8 @@ class FillArray : public CommonTestFixture { gko::array seqs; }; -TYPED_TEST_SUITE(FillArray, gko::test::ValueAndIndexTypes, - TypenameNameGenerator); +using LIST = ::testing::Types; +TYPED_TEST_SUITE(FillArray, LIST, TypenameNameGenerator); TYPED_TEST(FillArray, EqualsReference) @@ -88,5 +88,10 @@ TYPED_TEST(FillArray, FillSeqEqualsReference) gko::kernels::EXEC_NAMESPACE::components::fill_seq_array( this->exec, this->dvals.get_data(), this->total_size); + this->dvals.set_executor(this->ref); + for (gko::size_type i = 2000; i < this->total_size; i++) { + std::cout << i << " " << this->seqs.get_data()[i] << " device " + << this->dvals.get_data()[i] << std::endl; + } 
GKO_ASSERT_ARRAY_EQ(this->seqs, this->dvals); } diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp index 6738125ded6..490b0e90fdb 100644 --- a/test/components/reduce_array_kernels.cpp +++ b/test/components/reduce_array_kernels.cpp @@ -52,14 +52,19 @@ template class ReduceArray : public CommonTestFixture { protected: using value_type = T; + // In bfloat16, 256 + 1 -> 256. The reference gets 256 but parallel version + // doesn't due to ordering and grouping. ReduceArray() - : total_size(6355), + : total_size( + (std::is_same, gko::bfloat16>::value) + ? 254 + : 1024), out{ref, I{2}}, dout{exec, out}, vals{ref, total_size}, dvals{exec} { - std::fill_n(vals.get_data(), total_size, 3); + std::fill_n(vals.get_data(), total_size, 1); dvals = vals; } diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index 6e907acaa37..62751309f08 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -73,8 +73,7 @@ class ParIc : public CommonTestFixture { mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(0, 10.0), - rand_engine, ref); + std::normal_distribution<>(0, 10.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); dmtx_l_ani_init = Csr::create(exec); @@ -139,6 +138,8 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 6f85229bb8a..c379324172f 100644 --- a/test/factorization/par_ict_kernels.cpp +++ 
b/test/factorization/par_ict_kernels.cpp @@ -79,15 +79,11 @@ class ParIct : public CommonTestFixture { mtx = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(10, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); @@ -159,6 +155,8 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index a6c0706e6ee..22f4b740fd7 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -90,8 +90,7 @@ class ParIlu : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(0, num_cols - 1), - std::normal_distribution>(0.0, 1.0), - rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); } std::unique_ptr gen_unsorted_mtx(index_type num_rows, @@ -277,6 +276,8 @@ TYPED_TEST(ParIlu, KernelInitializeParILUIsEquivalentToRef) TYPED_TEST(ParIlu, KernelComputeParILUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; @@ -295,6 +296,7 @@ TYPED_TEST(ParIlu, 
KernelComputeParILUWithMoreIterationsIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index e457515e72c..2bd53c19717 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -80,39 +80,27 @@ class ParIlut : public CommonTestFixture { mtx1 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(10, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx2 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(0, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_square = gko::test::generate_random_matrix( mtx_size[0], mtx_size[0], std::uniform_int_distribution(1, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l2 = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], true, std::uniform_int_distribution(1, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_u = gko::test::generate_random_upper_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), 
rand_engine, ref); dmtx1 = gko::clone(exec, mtx1); dmtx2 = gko::clone(exec, mtx2); @@ -166,7 +154,7 @@ class ParIlut : public CommonTestFixture { const std::unique_ptr& dmtx, index_type rank) { double tolerance = - gko::is_complex() ? r::value : 0.0; + gko::is_complex() ? double(r::value) : 0.0; auto size = index_type(mtx->get_num_stored_elements()); using ValueType = typename Mtx::value_type; @@ -221,7 +209,7 @@ class ParIlut : public CommonTestFixture { const std::unique_ptr& dmtx, index_type rank) { double tolerance = - gko::is_complex() ? r::value : 0.0; + gko::is_complex() ? double(r::value) : 0.0; auto res = Mtx::create(ref, mtx_size); auto dres = Mtx::create(exec, mtx_size); auto res_coo = Coo::create(ref, mtx_size); @@ -408,6 +396,8 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + // there's one value larger than half range + SKIP_IF_HALF(value_type); auto square_size = this->mtx_square->get_size(); auto mtx_lu = Csr::create(this->ref, square_size); this->mtx_l2->apply(this->mtx_u, mtx_lu); @@ -436,6 +426,8 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); auto mtx_u_coo = Coo::create(this->ref, square_size); diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index ee32c52a358..3571a9db5fb 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -69,7 +69,7 @@ class Fbcsr : public CommonTestFixture { std::unique_ptr rsorted; - std::normal_distribution> distb; + std::normal_distribution<> distb; std::default_random_engine engine; value_type get_random_value() @@ -83,7 +83,10 @@ class Fbcsr : public CommonTestFixture { for (index_type i = 
0; i < x->get_size()[0] * x->get_size()[1]; i++) { xarr[i] = static_cast(2.0) * - std::sin(static_cast(i / 2.0) + get_random_value()); + static_cast( + std::sin(static_cast< + typename gko::detail::arth_type::type>( + static_cast(i / 2.0) + get_random_value()))); } } }; @@ -155,6 +158,10 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) using Mtx = typename TestFixture::Mtx; using Dense = typename TestFixture::Dense; using value_type = typename Mtx::value_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1)); @@ -177,6 +184,10 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) using Mtx = typename TestFixture::Mtx; using Dense = typename TestFixture::Dense; using value_type = typename Mtx::value_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3)); @@ -200,6 +211,10 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1)); @@ -230,6 +245,10 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + SKIP_IF_BFLOAT16(value_type); + } auto drand = gko::clone(this->exec, 
this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3)); diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp index fd9dda821c0..1d59d687a66 100644 --- a/test/matrix/fft_kernels.cpp +++ b/test/matrix/fft_kernels.cpp @@ -122,7 +122,8 @@ class Fft : public CommonTestFixture { }; -TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesNoHalf, + TypenameNameGenerator); TYPED_TEST(Fft, Apply1DIsEqualToReference) diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 9192b2eeebe..b75626daefc 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -588,7 +588,7 @@ class Matrix : public CommonTestFixture { using Mtx = typename T::matrix_type; using index_type = typename Mtx::index_type; using value_type = typename Mtx::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using Vec = gko::matrix::Dense; using MixedVec = gko::matrix::Dense; @@ -617,10 +617,7 @@ class Matrix : public CommonTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -640,10 +637,7 @@ class Matrix : public CommonTestFixture { return {gko::initialize( {gko::test::detail::get_rand_value< typename VecType::value_type>( - std::normal_distribution< - gko::remove_complex>( - 0.0, 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, ref), exec}; } diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 7b72f4aeaab..1814750f803 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -239,12 +239,10 @@ class Matrix : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + 
std::normal_distribution<>(), this->engine, this->exec); beta = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + std::normal_distribution<>(), this->engine, this->exec); } void SetUp() override { ASSERT_EQ(comm.size(), 3); } @@ -284,14 +282,12 @@ class Matrix : public CommonMpiTestFixture { num_rows, num_cols, std::uniform_int_distribution(static_cast(num_cols), static_cast(num_cols)), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto mat_md = gko::test::generate_random_matrix_data( num_rows, num_rows, std::uniform_int_distribution(0, static_cast(num_rows)), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto row_mapping = gko::test::generate_random_array< gko::experimental::distributed::comm_index_type>( @@ -364,9 +360,9 @@ TYPED_TEST(Matrix, CanApplyToMultipleVectors) using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::global_index_type; auto vec_md = gko::matrix_data{ - I>{{1, 11}, {2, 22}, {3, 33}, {4, 44}, {5, 55}}}; + I>{{1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}}}; I> result[3] = { - {{10, 110}, {18, 198}}, {{28, 308}, {67, 737}}, {{59, 649}}}; + {{10, 25}, {18, 53}}, {{28, 83}, {67, 142}}, {{59, 154}}}; auto rank = this->comm.rank(); this->x->read_distributed(vec_md, this->col_part); this->y->read_distributed(vec_md, this->row_part); @@ -442,7 +438,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); @@ -468,7 +464,7 @@ 
TYPED_TEST(Matrix, CanMoveToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index 59462a9be59..b04cbf53b87 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -74,7 +74,7 @@ template struct SimpleSolverTest { using solver_type = SolverType; using value_type = typename solver_type::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using local_index_type = gko::int32; using global_index_type = gko::int64; using dist_matrix_type = @@ -213,7 +213,7 @@ class Solver : public CommonMpiTestFixture { using local_index_type = typename T::local_index_type; using global_index_type = typename T::global_index_type; using value_type = typename T::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using Vec = typename T::dist_vector_type; using LocalVec = typename T::non_dist_vector_type; using MixedVec = typename T::mixed_dist_vector_type; @@ -252,10 +252,7 @@ class Solver : public CommonMpiTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -282,10 +279,7 @@ class Solver : public CommonMpiTestFixture { { return gko::share(gko::initialize( {gko::test::detail::get_rand_value( - std::normal_distribution< - gko::remove_complex>(0.0, - 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, exec)); } 
diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index a7ad735458c..3c568d8208c 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -545,7 +545,7 @@ class VectorReductions : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypes, +TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypesNoHalf, TypenameNameGenerator); @@ -770,8 +770,7 @@ class VectorLocalOps : public CommonMpiTestFixture { local_size[0], local_size[1], std::uniform_int_distribution(local_size[1], local_size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); dist = DistVectorType::create(exec, comm, size, gko::clone(local)); } @@ -783,8 +782,7 @@ class VectorLocalOps : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, size[1], std::uniform_int_distribution(size[1], size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); } void init_complex_vectors() @@ -847,7 +845,7 @@ TYPED_TEST(VectorLocalOps, AdvancedApplyNotSupported) TYPED_TEST(VectorLocalOps, ConvertsToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -863,7 +861,7 @@ TYPED_TEST(VectorLocalOps, ConvertsToPrecision) TYPED_TEST(VectorLocalOps, MovesToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -978,8 +976,7 @@ TYPED_TEST(VectorLocalOps, FillSameAsLocal) { using value_type = 
typename TestFixture::value_type; auto value = gko::test::detail::get_rand_value( - std::normal_distribution>(), - this->engine); + std::normal_distribution<>(), this->engine); this->init_vectors(); this->x->fill(value); diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp index 0a30f7ba67f..c1c14901a56 100644 --- a/test/solver/direct.cpp +++ b/test/solver/direct.cpp @@ -82,9 +82,7 @@ class Direct : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); } void initialize_data(const char* mtx_filename, int nrhs) diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index b6f228c13f5..2e09c64ba18 100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -560,7 +560,7 @@ class Solver : public CommonTestFixture { using Precond = typename T::precond_type; using Mtx = typename T::matrix_type; using value_type = typename Mtx::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using Vec = gko::matrix::Dense; using MixedVec = gko::matrix::Dense; @@ -589,10 +589,7 @@ class Solver : public CommonTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -613,10 +610,7 @@ class Solver : public CommonTestFixture { return {gko::initialize( {gko::test::detail::get_rand_value< typename VecType::value_type>( - std::normal_distribution< - gko::remove_complex>( - 0.0, 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, ref), exec}; } @@ -1001,6 +995,7 @@ TYPED_TEST(Solver, MixedApplyIsEquivalentToRef) solver.ref->apply(b.ref, x.ref); solver.dev->apply(b.dev, x.dev); + // TODO: in double with half, 4 
iterations lead to inf GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol(x)); }); }); @@ -1019,6 +1014,7 @@ TYPED_TEST(Solver, MixedAdvancedApplyIsEquivalentToRef) solver.ref->apply(alpha.ref, b.ref, beta.ref, x.ref); solver.dev->apply(alpha.dev, b.dev, beta.dev, x.dev); + // TODO: in double with half, 4 iterations lead to inf GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol(x)); }); });