diff --git a/projects/2021/groupK_burgers_openmp/HPC4WC.pdf b/projects/2021/groupK_burgers_openmp/HPC4WC.pdf new file mode 100644 index 00000000..8170bad2 Binary files /dev/null and b/projects/2021/groupK_burgers_openmp/HPC4WC.pdf differ diff --git a/projects/2021/groupK_burgers_openmp/branch-master/CMakeLists.txt b/projects/2021/groupK_burgers_openmp/branch-master/CMakeLists.txt new file mode 100644 index 00000000..20635e44 --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-master/CMakeLists.txt @@ -0,0 +1,27 @@ +cmake_minimum_required(VERSION 3.14) +project(HPWCProject) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +include(FetchContent) + +find_package(Eigen3 3.4 QUIET) +if(NOT EIGEN_FOUND) + FetchContent_Declare( + Eigen + GIT_REPOSITORY https://gitlab.com/libeigen/eigen + GIT_TAG 3.4 + ) + FetchContent_MakeAvailable(Eigen) +endif() + +FetchContent_Declare( + ProgramOptionsHxx + GIT_REPOSITORY https://github.com/Fytch/ProgramOptions.hxx +) +FetchContent_MakeAvailable(ProgramOptionsHxx) + +add_executable(HPWCProject main.cpp) +target_link_libraries(HPWCProject Eigen3::Eigen ProgramOptionsHxx) diff --git a/projects/2021/groupK_burgers_openmp/branch-master/main.cpp b/projects/2021/groupK_burgers_openmp/branch-master/main.cpp new file mode 100644 index 00000000..67a61360 --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-master/main.cpp @@ -0,0 +1,495 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using Scalar = double; +using TemplateParamScalar = +#if __cpp_nontype_template_args >= 201911 + Scalar +#else + long int +#endif +; +// using Vector3 = Eigen::Matrix; +using Vector = Eigen::Matrix; +using Vector2 = Eigen::Matrix; +using Matrix = Eigen::Matrix; + +template +concept Tensoroid = true; // CBA +template +concept Matrixoid = requires(T const& m) { + { m(0, 0) } -> std::convertible_to; + { m.rows() } -> std::integral; + { m.cols() } -> std::integral; + // more stuff but CBA +}; +template +concept Vectoroid = requires(T const& v) { + { v[0] } -> std::convertible_to; + // more stuff but CBA +}; + +namespace Detail { + template + struct IsEigenSequence : std::false_type {}; + template + struct IsEigenSequence> : std::true_type {}; +} +template +concept Sequenceoid = Detail::IsEigenSequence::value; + +template +[[nodiscard]] constexpr T sq(T const& x) { + return x * x; +} +[[nodiscard]] constexpr auto absi(std::signed_integral auto x) noexcept { + assert(x != std::numeric_limits::min()); + return x < 0 ? -x : x; +} +[[nodiscard]] constexpr auto absi(std::unsigned_integral auto x) noexcept { + return x; +} +[[nodiscard]] constexpr unsigned exp2i(unsigned x) noexcept { + assert(x < std::numeric_limits::digits); + return 1<; + +template +struct ScopeGuard { + Invocable invocable; + [[nodiscard]] explicit constexpr ScopeGuard(Invocable&& invocable) noexcept(std::is_nothrow_move_constructible_v) : invocable(std::move(invocable)) {} + [[nodiscard]] explicit constexpr ScopeGuard(Invocable const& invocable) noexcept(std::is_nothrow_copy_constructible_v) : invocable(invocable) {} + constexpr ~ScopeGuard() noexcept(std::is_nothrow_invocable_v) { invocable(); } +}; + +template +struct Rank1Slice { + Tensor* tensor; + std::array position; + + [[nodiscard]] constexpr Rank1Slice(Tensor& tensor, std::convertible_to auto const&... indices) + : tensor(std::addressof(tensor)), position{ indices... } { + static_assert(sizeof...(indices) == ranks); + } + + // not multi-thread friendly + [[nodiscard]] constexpr decltype(auto) operator[](IndexType i) const { + assert(tensor); + const_cast(this)->position[slicedRank] += i; + const ScopeGuard restore([this, i]{ const_cast(this)->position[slicedRank] -= i; }); + return std::apply(*tensor, position); + } +}; +template +[[nodiscard]] constexpr auto rank1Slice(Tensoroid auto& tensor, std::integral auto const&... indices) { + return Rank1Slice, sizeof...(indices), slicedRank, std::common_type_t...>>(tensor, indices...); +} + +template typename F, typename I, typename... Args> +struct FoldR; +template typename F, typename I, typename T> +struct FoldR : std::type_identity::type> {}; +template typename F, typename I, typename T, typename U, typename... Tail> +struct FoldR : std::type_identity::type>::type> {}; + +template +struct IndexCoeffPair { + static constexpr auto index = index_; + static constexpr auto factor = factor_; +}; +template +struct IsIndexCoeffPair : std::false_type {}; +template +struct IsIndexCoeffPair> : std::true_type {}; +template +concept IndexCoeff = IsIndexCoeffPair::value; + +template +struct IndexMin : std::type_identity> {}; +template +struct IndexMax : std::type_identity Rhs::index), Lhs, Rhs>> {}; + +enum class StencilType { asymmetric, symmetric, antisymmetric }; +template +struct Stencil /* 1D only */ { + static constexpr auto min_index = FoldR::max(), {}>, Pairs...>::type::index; + static constexpr auto max_index = FoldR::min(), {}>, Pairs...>::type::index; + + static constexpr auto left = type == StencilType::asymmetric ? min_index : -std::max(absi(min_index), max_index); + static constexpr auto right = type == StencilType::asymmetric ? max_index : std::max(absi(min_index), max_index); + static constexpr auto margin = sizeof...(Pairs) > 0 ? std::max(absi(left), right) : 0; + + [[nodiscard]] static constexpr auto apply(Vectoroid auto&& vector) { + switch(type) { + case StencilType::asymmetric: + return ((Pairs::factor * vector[Pairs::index]) + ...); + case StencilType::symmetric: + return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] + vector[-Pairs::index] : vector[Pairs::index])) + ...); + case StencilType::antisymmetric: + return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] - vector[-Pairs::index] : vector[Pairs::index])) + ...); + }; + } +}; + +class VectorField2 { + Matrix uv[2]; + +public: + [[nodiscard]] VectorField2() = default; + [[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols) + : uv{ Matrix(rows, cols), Matrix(rows, cols) } { + } + [[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols, Scalar fill) + : uv{ Matrix::Constant(rows, cols, fill), Matrix::Constant(rows, cols, fill) } { + } + + [[nodiscard]] constexpr Matrix const& operator[](int i) const noexcept { assert(0 <= i && i < 2); return uv[i]; } + [[nodiscard]] constexpr Matrix & operator[](int i) noexcept { assert(0 <= i && i < 2); return uv[i]; } + + [[nodiscard]] constexpr Eigen::Index rows() const noexcept { return uv[0].rows(); } + [[nodiscard]] constexpr Eigen::Index cols() const noexcept { return uv[0].cols(); } +}; + +using AdvectionStencil1 = Stencil, + IndexCoeffPair<2, TemplateParamScalar(- 9)>, + IndexCoeffPair<3, TemplateParamScalar( 1)>>; +using AdvectionStencil2 = Stencil, + IndexCoeffPair<1, TemplateParamScalar( 15)>, + IndexCoeffPair<2, TemplateParamScalar(- 6)>, + IndexCoeffPair<3, TemplateParamScalar( 1)>>; + +using DiffusionStencil = Stencil, + IndexCoeffPair<1, TemplateParamScalar( 16)>, + IndexCoeffPair<2, TemplateParamScalar(- 1)>>; + +constexpr int margin = std::max({ AdvectionStencil1::margin, AdvectionStencil2::margin, DiffusionStencil::margin }); + +[[nodiscard]] constexpr Scalar advection(Scalar d, Scalar w, Vectoroid auto const& phiSlice) { + return + ( w * AdvectionStencil1::apply(phiSlice) + - std::abs(w) * AdvectionStencil2::apply(phiSlice)) / (60 * d); +} +template +void advection(Vectoroid auto const& d, Matrixoid auto const& w, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable auto assignment) { + const int nx = w.cols(), ny = w.rows(); + assert(phi.cols() == nx && phi.rows() == ny); + assert(out.cols() == nx && out.rows() == ny); + // #pragma omp parallel for schedule(static) + for(int x = margin; x < nx - margin; ++x) + for(int y = margin; y < ny - margin; ++y) + assignment(out(y, x), advection(d[advectRank], w(y, x), rank1Slice(phi, y, x))); +} +void advection(Vectoroid auto const& d, VectorField2 const& uv, VectorField2& adv) { + advection<0>(d, uv[0], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + advection<1>(d, uv[1], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); + + advection<0>(d, uv[0], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + advection<1>(d, uv[1], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); +} + +[[nodiscard]] constexpr Scalar diffusion(Scalar d, Vectoroid auto const& phiSlice) { + return 1 / (12 * sq(d)) * DiffusionStencil::apply(phiSlice); +} +template +void diffusion(Vectoroid auto const& d, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable auto assignment) { + const int nx = phi.cols(), ny = phi.rows(); + assert(out.cols() == nx && out.rows() == ny); + // #pragma omp parallel for schedule(static) + for(int x = margin; x < nx - margin; ++x) + for(int y = margin; y < ny - margin; ++y) + assignment(out(y, x), diffusion(d[diffuseRank], rank1Slice(phi, y, x))); +} +void diffusion(Vectoroid auto const& d, VectorField2 const& uv, VectorField2& diff) { + diffusion<0>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + diffusion<1>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); + + diffusion<0>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + diffusion<1>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); +} + +void rk_stage( + Scalar mu, + VectorField2 const& in_now, + VectorField2 const& in_tmp, + VectorField2& out, + VectorField2& adv, + VectorField2& diff, + Scalar dt, + Vectoroid auto const& d +) { + advection(d, in_tmp, adv); + diffusion(d, in_tmp, diff); + out[0] = in_now[0] + dt * (-adv[0] + mu * diff[0]); + out[1] = in_now[1] + dt * (-adv[1] + mu * diff[1]); +} + +enum UseCase { zhao, hopf_cole }; + +[[nodiscard]] constexpr VectorField2 solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, Sequenceoid auto slice_x, Sequenceoid auto slice_y) { + const int mi = slice_x.size() ? slice_x[slice_x.size() - 1] - slice_x[0] + 1 : 0; + const int mj = slice_y.size() ? slice_y[slice_y.size() - 1] - slice_y[0] + 1 : 0; + + const Matrixoid auto x2d = x(slice_x).replicate(1, mj); + const Matrixoid auto y2d = y(slice_y).transpose().replicate(mi, 1); + + const int nx = x2d.cols(), ny = x2d.rows(); + assert(y2d.cols() == nx && y2d.rows() == ny); + + using std::exp; + VectorField2 result; + switch(useCase) { + case zhao: + result[0] = -4 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().cos() * (pi * y2d).array().sin() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin()); + result[1] = -2 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().cos() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin()); + break; + case hopf_cole: + result[0] = 0.75 - 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu))); + result[1] = 0.75 + 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu))); + break; + default: + assert(false); + } + return result; +} +[[nodiscard]] constexpr decltype(auto) solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y) { + return solution_factory(useCase, mu, t, x, y, Eigen::seq(0, x.size() - 1), Eigen::seq(0, y.size() - 1)); +} + +[[nodiscard]] constexpr VectorField2 initial_solution(UseCase useCase, Scalar mu, Matrixoid auto const& x, Matrixoid auto const& y) { + return solution_factory(useCase, mu, 0, x, y); +} + +void enforce_boundary_conditions(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, VectorField2& uv) { + const int nx = x.size(), ny = y.size(); + + { + auto slice_x = Eigen::seq(0, margin - 1); + auto slice_y = Eigen::seq(0, ny - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y); + uv[0](slice_x, slice_y) = std::move(uv_ex[0]); + uv[1](slice_x, slice_y) = std::move(uv_ex[1]); + } + { + auto slice_x = Eigen::seq(nx - margin, nx - 1); + auto slice_y = Eigen::seq(0, ny - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y); + uv[0](slice_x, slice_y) = std::move(uv_ex[0]); + uv[1](slice_x, slice_y) = std::move(uv_ex[1]); + } + { + auto slice_x = Eigen::seq(margin, nx - margin - 1); + auto slice_y = Eigen::seq(0, margin - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y); + uv[0](slice_x, slice_y) = std::move(uv_ex[0]); + uv[1](slice_x, slice_y) = std::move(uv_ex[1]); + } + { + auto slice_x = Eigen::seq(margin, nx - margin - 1); + auto slice_y = Eigen::seq(ny - margin, ny - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y); + uv[0](slice_x, slice_y) = std::move(uv_ex[0]); + uv[1](slice_x, slice_y) = std::move(uv_ex[1]); + } +} + +// TODO: only a makeshift because my stdlib doesn't implement this +template +std::basic_ostream& operator<<(std::basic_ostream& os, std::chrono::duration const& d) { + std::basic_ostringstream s; + s.flags(os.flags()); + s.imbue(os.getloc()); + s.precision(os.precision()); + s << d.count(); + if constexpr(std::is_same_v) s << "ns"; + else if constexpr(std::is_same_v) s << "µs"; + else if constexpr(std::is_same_v) s << "ms"; + else if constexpr(std::is_same_v>) s << "s"; + else { + s << '[' << Period::type::num; + if constexpr(Period::type::den != 1) + s << '/' << Period::type::den; + s << ']'; + } + return os << s.str(); +} + +template +struct WelfordVariance { + std::size_t n = 0; + T mean{}; + T m{}; + + constexpr void update(T const& x) { + ++n; + const T old_mean = mean; + mean = ((n - 1) * old_mean + x) / n; + m += (x - old_mean) * (x - mean); + } + + [[nodiscard]] constexpr T variance() { return m / n; } + [[nodiscard]] constexpr T sample_variance() { return m / (n - 1); } +}; + +struct BenchmarkReport { + using Duration = std::chrono::duration; + + Duration best; + Duration average; + Duration stddev; + unsigned runs; +}; +std::ostream& operator<<(std::ostream& stream, BenchmarkReport const& report) { + stream << "runtime: " << report.average << " ± " << report.stddev << "; best: " << report.best << "; n=" << report.runs << '\n'; + return stream; +} + +template +[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak, Clock clock) { + assert(streak > 0); + using std::sqrt; + BenchmarkReport result; + WelfordVariance variance; + result.best = BenchmarkReport::Duration(std::numeric_limits::max()); + for(unsigned current_streak = 0; current_streak < streak; ) { + const auto start = clock.now(); + invocable(); + const auto end = clock.now(); + const auto duration = std::chrono::duration_cast(end - start); + if(duration < result.best) { + result.best = duration; + current_streak = 0; + } else { + ++current_streak; + } + variance.update(duration.count()); + } + result.average = BenchmarkReport::Duration(variance.mean); + result.stddev = BenchmarkReport::Duration(sqrt(variance.sample_variance())); + result.runs = variance.n; + return result; +} +template +[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak = 100) { + if constexpr(std::chrono::high_resolution_clock::is_steady) + return benchmark(std::forward(invocable), streak, std::chrono::high_resolution_clock{}); + else + return benchmark(std::forward(invocable), streak, std::chrono::steady_clock{}); +} + +int main(int argc, char** argv) +try { + unsigned int sideLength = 21; + unsigned int iterationsOption = -1; + unsigned int streak = 100; + + po::parser parser; + auto& help = parser["help"] + .abbreviation('?') + .description("print this help screen"); + parser["sideLength"] + .abbreviation('l') + .description("set the grid side length (default: 21)") + .bind(sideLength); + parser["iterations"] + .abbreviation('i') + .description("set the grid side length (default: (sideLength-1)^2)") + .bind(iterationsOption); + parser["streak"] + .abbreviation('s') + .description("set the length of a streak required to escape the benchmarking algorithm (default: 100)") + .bind(streak); + + if(!parser(argc, argv)) + return EXIT_FAILURE; + + if(help.was_set()) { + std::cout << parser << '\n'; + return EXIT_SUCCESS; + } + + if(sideLength < 1) + throw std::range_error("sideLength shall be >= 1"); + + if(streak < 1) + throw std::range_error("streak shall be >= 1"); + + // use case + constexpr UseCase useCase = zhao; + + // diffusion coefficient + const Scalar mu = 0.1; + + // time + const Scalar cfl = 1; + const Scalar timestep = cfl / sq(sideLength - 1); + const int n_iter = iterationsOption != -1 ? iterationsOption : sq(sideLength - 1); + + // output + const int print_period = 0; + + // grid + const int nx = sideLength; + const Vector x = Vector::LinSpaced(nx, 0, 1); + const Scalar dx = 1.0 / (nx - 1); + const int ny = sideLength; + const Vector y = Vector::LinSpaced(ny, 0, 1); + const Scalar dy = 1.0 / (ny - 1); + + // vector fields + VectorField2 uv_now(ny, nx); + VectorField2 uv_new(ny, nx); + VectorField2 adv(ny, nx, 0); + VectorField2 diff(ny, nx, 0); + + const auto rk_fractions = { 1.0 / 3, 1.0 / 2, 1.0 }; + + const auto single_threaded_task = [&]{ + uv_now = VectorField2(ny, nx, 0); + uv_new = initial_solution(useCase, mu, x, y); + + Scalar t = 0; + for(int i = 0; i < n_iter; ++i) { + uv_now = uv_new; + + for(const double rk_fraction : rk_fractions) { + const Scalar dt = rk_fraction * timestep; + rk_stage(mu, uv_now, uv_new, uv_new, adv, diff, dt, Vector2{ dx, dy }); + enforce_boundary_conditions(useCase, mu, t + dt, x, y, uv_new); + } + + t += timestep; + if(print_period > 0 && ((i+1) % print_period == 0 || i+1 == n_iter)) { + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y); + const Scalar err_u = (uv_new[0].block(margin, margin, ny - 2 * margin, nx - 2 * margin) - uv_ex[0].block(margin, margin, ny - 2 * margin, nx - 2 * margin)).norm() * std::sqrt(dx * dy); + const Scalar err_v = (uv_new[1].block(margin, margin, ny - 2 * margin, nx - 2 * margin) - uv_ex[1].block(margin, margin, ny - 2 * margin, nx - 2 * margin)).norm() * std::sqrt(dx * dy); + std::cout << "Iteration " << std::right << std::setfill(' ') << std::setw(6) << i + 1; + std::cout << std::scientific << std::setprecision(4); + std::cout << ": ||u - uex|| = " << err_u << " m/s, ||v - vex|| = " << err_v << " m/s\n"; + std::cout << std::defaultfloat; + } + } + }; + std::cout << "Single-threaded benchmark:\n" << benchmark(single_threaded_task, streak); +} catch(std::exception const& exception) { + std::cerr << po::red << "exception"; + std::cerr << ": " << exception.what() << '\n'; + return EXIT_FAILURE; +} diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/CMakeLists.txt b/projects/2021/groupK_burgers_openmp/branch-omp/CMakeLists.txt new file mode 100644 index 00000000..aa88cbcc --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 3.14) +project(HPWCProject) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +include(FetchContent) + +find_package(Eigen3 3.4 QUIET) +if(NOT EIGEN_FOUND) + FetchContent_Declare( + Eigen + GIT_REPOSITORY https://gitlab.com/libeigen/eigen + GIT_TAG 3.4 + ) + FetchContent_MakeAvailable(Eigen) +endif() + +FetchContent_Declare( + ProgramOptionsHxx + GIT_REPOSITORY https://github.com/Fytch/ProgramOptions.hxx +) +FetchContent_MakeAvailable(ProgramOptionsHxx) + +find_package(OpenMP REQUIRED) + +add_executable(HPWCProject main.cpp) +target_link_libraries(HPWCProject Eigen3::Eigen ProgramOptionsHxx OpenMP::OpenMP_CXX) diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/bench.sh b/projects/2021/groupK_burgers_openmp/branch-omp/bench.sh new file mode 100755 index 00000000..73484687 --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/bench.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +for (( i=1; i <= $1; ++i )) do + ./HPWCProject -l480 -i229 -s20 -n"$i" +done diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/main.cpp b/projects/2021/groupK_burgers_openmp/branch-omp/main.cpp new file mode 100644 index 00000000..3de91dfd --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/main.cpp @@ -0,0 +1,511 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using Scalar = double; +using TemplateParamScalar = +#if __cpp_nontype_template_args >= 201911 + Scalar +#else + long int +#endif +; +// using Vector3 = Eigen::Matrix; +using Vector = Eigen::Matrix; +using Vector2 = Eigen::Matrix; +using Matrix = Eigen::Matrix; + +template +concept Tensoroid = true; // CBA +template +concept Matrixoid = requires(T const& m) { + { m(0, 0) } -> std::convertible_to; + { m.rows() } -> std::integral; + { m.cols() } -> std::integral; + // more stuff but CBA +}; +template +concept Vectoroid = requires(T const& v) { + { v[0] } -> std::convertible_to; + // more stuff but CBA +}; + +namespace Detail { + template + struct IsEigenSequence : std::false_type {}; + template + struct IsEigenSequence> : std::true_type {}; +} +template +concept Sequenceoid = Detail::IsEigenSequence::value; + +template +[[nodiscard]] constexpr T sq(T const& x) { + return x * x; +} +[[nodiscard]] constexpr auto absi(std::signed_integral auto x) noexcept { + assert(x != std::numeric_limits::min()); + return x < 0 ? -x : x; +} +[[nodiscard]] constexpr auto absi(std::unsigned_integral auto x) noexcept { + return x; +} +[[nodiscard]] constexpr unsigned exp2i(unsigned x) noexcept { + assert(x < std::numeric_limits::digits); + return 1<; + +template +struct ScopeGuard { + Invocable invocable; + [[nodiscard]] explicit constexpr ScopeGuard(Invocable&& invocable) noexcept(std::is_nothrow_move_constructible_v) : invocable(std::move(invocable)) {} + [[nodiscard]] explicit constexpr ScopeGuard(Invocable const& invocable) noexcept(std::is_nothrow_copy_constructible_v) : invocable(invocable) {} + constexpr ~ScopeGuard() noexcept(std::is_nothrow_invocable_v) { invocable(); } +}; + +template +struct Rank1Slice { + Tensor* tensor; + std::array position; + + [[nodiscard]] constexpr Rank1Slice(Tensor& tensor, std::convertible_to auto const&... indices) + : tensor(std::addressof(tensor)), position{ indices... } { + static_assert(sizeof...(indices) == ranks); + } + + // not multi-thread friendly + [[nodiscard]] constexpr decltype(auto) operator[](IndexType i) const { + assert(tensor); + const_cast(this)->position[slicedRank] += i; + const ScopeGuard restore([this, i]{ const_cast(this)->position[slicedRank] -= i; }); + return std::apply(*tensor, position); + } +}; +template +[[nodiscard]] constexpr auto rank1Slice(Tensoroid auto& tensor, std::integral auto const&... indices) { + return Rank1Slice, sizeof...(indices), slicedRank, std::common_type_t...>>(tensor, indices...); +} + +template typename F, typename I, typename... Args> +struct FoldR; +template typename F, typename I, typename T> +struct FoldR : std::type_identity::type> {}; +template typename F, typename I, typename T, typename U, typename... Tail> +struct FoldR : std::type_identity::type>::type> {}; + +template +struct IndexCoeffPair { + static constexpr auto index = index_; + static constexpr auto factor = factor_; +}; +template +struct IsIndexCoeffPair : std::false_type {}; +template +struct IsIndexCoeffPair> : std::true_type {}; +template +concept IndexCoeff = IsIndexCoeffPair::value; + +template +struct IndexMin : std::type_identity> {}; +template +struct IndexMax : std::type_identity Rhs::index), Lhs, Rhs>> {}; + +enum class StencilType { asymmetric, symmetric, antisymmetric }; +template +struct Stencil /* 1D only */ { + static constexpr auto min_index = FoldR::max(), {}>, Pairs...>::type::index; + static constexpr auto max_index = FoldR::min(), {}>, Pairs...>::type::index; + + static constexpr auto left = type == StencilType::asymmetric ? min_index : -std::max(absi(min_index), max_index); + static constexpr auto right = type == StencilType::asymmetric ? max_index : std::max(absi(min_index), max_index); + static constexpr auto margin = sizeof...(Pairs) > 0 ? std::max(absi(left), right) : 0; + + [[nodiscard]] static constexpr auto apply(Vectoroid auto&& vector) { + switch(type) { + case StencilType::asymmetric: + return ((Pairs::factor * vector[Pairs::index]) + ...); + case StencilType::symmetric: + return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] + vector[-Pairs::index] : vector[Pairs::index])) + ...); + case StencilType::antisymmetric: + return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] - vector[-Pairs::index] : vector[Pairs::index])) + ...); + }; + } +}; + +class VectorField2 { + Matrix uv[2]; + +public: + [[nodiscard]] VectorField2() = default; + [[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols) + : uv{ Matrix(rows, cols), Matrix(rows, cols) } { + } + [[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols, Scalar fill) + : uv{ Matrix::Constant(rows, cols, fill), Matrix::Constant(rows, cols, fill) } { + } + + [[nodiscard]] constexpr Matrix const& operator[](int i) const noexcept { assert(0 <= i && i < 2); return uv[i]; } + [[nodiscard]] constexpr Matrix & operator[](int i) noexcept { assert(0 <= i && i < 2); return uv[i]; } + + [[nodiscard]] constexpr Eigen::Index rows() const noexcept { return uv[0].rows(); } + [[nodiscard]] constexpr Eigen::Index cols() const noexcept { return uv[0].cols(); } +}; + +using AdvectionStencil1 = Stencil, + IndexCoeffPair<2, TemplateParamScalar(- 9)>, + IndexCoeffPair<3, TemplateParamScalar( 1)>>; +using AdvectionStencil2 = Stencil, + IndexCoeffPair<1, TemplateParamScalar( 15)>, + IndexCoeffPair<2, TemplateParamScalar(- 6)>, + IndexCoeffPair<3, TemplateParamScalar( 1)>>; + +using DiffusionStencil = Stencil, + IndexCoeffPair<1, TemplateParamScalar( 16)>, + IndexCoeffPair<2, TemplateParamScalar(- 1)>>; + +constexpr int margin = std::max({ AdvectionStencil1::margin, AdvectionStencil2::margin, DiffusionStencil::margin }); + +[[nodiscard]] constexpr Scalar advection(Scalar d, Scalar w, Vectoroid auto const& phiSlice) { + return + ( w * AdvectionStencil1::apply(phiSlice) + - std::abs(w) * AdvectionStencil2::apply(phiSlice)) / (60 * d); +} +template +void advection(Vectoroid auto const& d, Matrixoid auto const& w, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable auto assignment) { + const int nx = w.cols(), ny = w.rows(); + assert(phi.cols() == nx && phi.rows() == ny); + assert(out.cols() == nx && out.rows() == ny); + #pragma omp parallel for schedule(static) + // #pragma omp parallel for schedule(dynamic) + // #pragma omp parallel for schedule(static) collapse(2) + // #pragma omp parallel for schedule(dynamic) collapse(2) + for(int x = margin; x < nx - margin; ++x) + for(int y = margin; y < ny - margin; ++y) + assignment(out(y, x), advection(d[advectRank], w(y, x), rank1Slice(phi, y, x))); +} +void advection(Vectoroid auto const& d, VectorField2 const& uv, VectorField2& adv) { + advection<0>(d, uv[0], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + advection<1>(d, uv[1], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); + + advection<0>(d, uv[0], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + advection<1>(d, uv[1], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); +} + +[[nodiscard]] constexpr Scalar diffusion(Scalar d, Vectoroid auto const& phiSlice) { + return 1 / (12 * sq(d)) * DiffusionStencil::apply(phiSlice); +} +template +void diffusion(Vectoroid auto const& d, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable auto assignment) { + const int nx = phi.cols(), ny = phi.rows(); + assert(out.cols() == nx && out.rows() == ny); + #pragma omp parallel for schedule(static) + // #pragma omp parallel for schedule(dynamic) + // #pragma omp parallel for schedule(static) collapse(2) + // #pragma omp parallel for schedule(dynamic) collapse(2) + for(int x = margin; x < nx - margin; ++x) + for(int y = margin; y < ny - margin; ++y) + assignment(out(y, x), diffusion(d[diffuseRank], rank1Slice(phi, y, x))); +} +void diffusion(Vectoroid auto const& d, VectorField2 const& uv, VectorField2& diff) { + diffusion<0>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + diffusion<1>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); + + diffusion<0>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + diffusion<1>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); +} + +void rk_stage( + Scalar mu, + VectorField2 const& in_now, + VectorField2 const& in_tmp, + VectorField2& out, + VectorField2& adv, + VectorField2& diff, + Scalar dt, + Vectoroid auto const& d +) { + advection(d, in_tmp, adv); + diffusion(d, in_tmp, diff); + out[0] = in_now[0] + dt * (-adv[0] + mu * diff[0]); + out[1] = in_now[1] + dt * (-adv[1] + mu * diff[1]); +} + +enum UseCase { zhao, hopf_cole }; + +[[nodiscard]] constexpr VectorField2 solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, Sequenceoid auto slice_x, Sequenceoid auto slice_y) { + const int mi = slice_x.size() ? slice_x[slice_x.size() - 1] - slice_x[0] + 1 : 0; + const int mj = slice_y.size() ? slice_y[slice_y.size() - 1] - slice_y[0] + 1 : 0; + + const Matrixoid auto x2d = x(slice_x).replicate(1, mj); + const Matrixoid auto y2d = y(slice_y).transpose().replicate(mi, 1); + + const int nx = x2d.cols(), ny = x2d.rows(); + assert(y2d.cols() == nx && y2d.rows() == ny); + + using std::exp; + VectorField2 result; + switch(useCase) { + case zhao: + result[0] = -4 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().cos() * (pi * y2d).array().sin() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin()); + result[1] = -2 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().cos() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin()); + break; + case hopf_cole: + result[0] = 0.75 - 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu))); + result[1] = 0.75 + 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu))); + break; + default: + assert(false); + } + return result; +} +[[nodiscard]] constexpr decltype(auto) solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y) { + return solution_factory(useCase, mu, t, x, y, Eigen::seq(0, x.size() - 1), Eigen::seq(0, y.size() - 1)); +} + +[[nodiscard]] constexpr VectorField2 initial_solution(UseCase useCase, Scalar mu, Matrixoid auto const& x, Matrixoid auto const& y) { + return solution_factory(useCase, mu, 0, x, y); +} + +void enforce_boundary_conditions(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, VectorField2& uv) { + const int nx = x.size(), ny = y.size(); + + { + auto slice_x = Eigen::seq(0, margin - 1); + auto slice_y = Eigen::seq(0, ny - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y); + uv[0](slice_x, slice_y) = std::move(uv_ex[0]); + uv[1](slice_x, slice_y) = std::move(uv_ex[1]); + } + { + auto slice_x = Eigen::seq(nx - margin, nx - 1); + auto slice_y = Eigen::seq(0, ny - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y); + uv[0](slice_x, slice_y) = std::move(uv_ex[0]); + uv[1](slice_x, slice_y) = std::move(uv_ex[1]); + } + { + auto slice_x = Eigen::seq(margin, nx - margin - 1); + auto slice_y = Eigen::seq(0, margin - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y); + uv[0](slice_x, slice_y) = std::move(uv_ex[0]); + uv[1](slice_x, slice_y) = std::move(uv_ex[1]); + } + { + auto slice_x = Eigen::seq(margin, nx - margin - 1); + auto slice_y = Eigen::seq(ny - margin, ny - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y); + uv[0](slice_x, slice_y) = std::move(uv_ex[0]); + uv[1](slice_x, slice_y) = std::move(uv_ex[1]); + } +} + +// TODO: only a makeshift because my stdlib doesn't implement this +template +std::basic_ostream& operator<<(std::basic_ostream& os, std::chrono::duration const& d) { + std::basic_ostringstream s; + s.flags(os.flags()); + s.imbue(os.getloc()); + s.precision(os.precision()); + s << d.count(); + if constexpr(std::is_same_v) s << "ns"; + else if constexpr(std::is_same_v) s << "µs"; + else if constexpr(std::is_same_v) s << "ms"; + else if constexpr(std::is_same_v>) s << "s"; + else { + s << '[' << Period::type::num; + if constexpr(Period::type::den != 1) + s << '/' << Period::type::den; + s << ']'; + } + return os << s.str(); +} + +template +struct WelfordVariance { + std::size_t n = 0; + T mean{}; + T m{}; + + constexpr void update(T const& x) { + ++n; + const T old_mean = mean; + mean = ((n - 1) * old_mean + x) / n; + m += (x - old_mean) * (x - mean); + } + + [[nodiscard]] constexpr T variance() { return m / n; } + [[nodiscard]] constexpr T sample_variance() { return m / (n - 1); } +}; + +struct BenchmarkReport { + using Duration = std::chrono::duration; + + Duration best; + Duration average; + Duration stddev; + unsigned runs; +}; +std::ostream& operator<<(std::ostream& stream, BenchmarkReport const& report) { + stream << "runtime: " << report.average << " ± " << report.stddev << "; best: " << report.best << "; n=" << report.runs << '\n'; + return stream; +} + +template +[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak, Clock clock) { + assert(streak > 0); + using std::sqrt; + BenchmarkReport result; + WelfordVariance variance; + result.best = BenchmarkReport::Duration(std::numeric_limits::max()); + for(unsigned current_streak = 0; current_streak < streak; ) { + const auto start = clock.now(); + invocable(); + const auto end = clock.now(); + const auto duration = std::chrono::duration_cast(end - start); + if(duration < result.best) { + result.best = duration; + current_streak = 0; + } else { + ++current_streak; + } + variance.update(duration.count()); + } + result.average = BenchmarkReport::Duration(variance.mean); + result.stddev = BenchmarkReport::Duration(sqrt(variance.sample_variance())); + result.runs = variance.n; + return result; +} +template +[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak = 100) { + if constexpr(std::chrono::high_resolution_clock::is_steady) + return benchmark(std::forward(invocable), streak, std::chrono::high_resolution_clock{}); + else + return benchmark(std::forward(invocable), streak, std::chrono::steady_clock{}); +} + +int main(int argc, char** argv) +try { + unsigned int threads = 1; + unsigned int sideLength = 21; + unsigned int iterationsOption = -1; + unsigned int streak = 100; + + po::parser parser; + auto& help = parser["help"] + .abbreviation('?') + .description("print this help screen"); + parser["threads"] + .abbreviation('n') + .description("set the number of threads (default: 1)") + .bind(threads); + parser["sideLength"] + .abbreviation('l') + .description("set the grid side length (default: 21)") + .bind(sideLength); + parser["iterations"] + .abbreviation('i') + .description("set the grid side length (default: (sideLength-1)^2)") + .bind(iterationsOption); + parser["streak"] + .abbreviation('s') + .description("set the length of a streak required to escape the benchmarking algorithm (default: 100)") + .bind(streak); + + if(!parser(argc, argv)) + return EXIT_FAILURE; + + if(help.was_set()) { + std::cout << parser << '\n'; + return EXIT_SUCCESS; + } + + if(threads < 1) + throw std::range_error("at least one thread required"); + omp_set_num_threads(threads); + + if(sideLength < 1) + throw std::range_error("sideLength shall be >= 1"); + + if(streak < 1) + throw std::range_error("streak shall be >= 1"); + + // use case + constexpr UseCase useCase = zhao; + + // diffusion coefficient + const Scalar mu = 0.1; + + // time + const Scalar cfl = 1; + const Scalar timestep = cfl / sq(sideLength - 1); + const int n_iter = iterationsOption != -1 ? iterationsOption : sq(sideLength - 1); + + // output + const int print_period = 0; + + // grid + const int nx = sideLength; + const Vector x = Vector::LinSpaced(nx, 0, 1); + const Scalar dx = 1.0 / (nx - 1); + const int ny = sideLength; + const Vector y = Vector::LinSpaced(ny, 0, 1); + const Scalar dy = 1.0 / (ny - 1); + + // vector fields + VectorField2 uv_now(ny, nx); + VectorField2 uv_new(ny, nx); + VectorField2 adv(ny, nx, 0); + VectorField2 diff(ny, nx, 0); + + const auto rk_fractions = { 1.0 / 3, 1.0 / 2, 1.0 }; + + const auto task = [&]{ + uv_now = VectorField2(ny, nx, 0); + uv_new = initial_solution(useCase, mu, x, y); + + Scalar t = 0; + for(int i = 0; i < n_iter; ++i) { + uv_now = uv_new; + + for(const double rk_fraction : rk_fractions) { + const Scalar dt = rk_fraction * timestep; + rk_stage(mu, uv_now, uv_new, uv_new, adv, diff, dt, Vector2{ dx, dy }); + enforce_boundary_conditions(useCase, mu, t + dt, x, y, uv_new); + } + + t += timestep; + if(print_period > 0 && ((i+1) % print_period == 0 || i+1 == n_iter)) { + VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y); + const Scalar err_u = (uv_new[0].block(margin, margin, ny - 2 * margin, nx - 2 * margin) - uv_ex[0].block(margin, margin, ny - 2 * margin, nx - 2 * margin)).norm() * std::sqrt(dx * dy); + const Scalar err_v = (uv_new[1].block(margin, margin, ny - 2 * margin, nx - 2 * margin) - uv_ex[1].block(margin, margin, ny - 2 * margin, nx - 2 * margin)).norm() * std::sqrt(dx * dy); + std::cout << "Iteration " << std::right << std::setfill(' ') << std::setw(6) << i + 1; + std::cout << std::scientific << std::setprecision(4); + std::cout << ": ||u - uex|| = " << err_u << " m/s, ||v - vex|| = " << err_v << " m/s\n"; + std::cout << std::defaultfloat; + } + } + }; + std::cout << threads << "-threaded benchmark:" << benchmark(task, streak); +} catch(std::exception const& exception) { + std::cerr << po::red << "exception"; + std::cerr << ": " << exception.what() << '\n'; + return EXIT_FAILURE; +} diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/plot b/projects/2021/groupK_burgers_openmp/branch-omp/plot new file mode 100644 index 00000000..6e13ac7d --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/plot @@ -0,0 +1,22 @@ +# just to get GPVAL_Y_MAX +p "results-shallow-static.txt" using 1:2 \ +, "results-shallow-dynamic.txt" using 1:2 \ +, "results-collapsed-static.txt" using 1:2 \ +#, "results-collapsed-dynamic.txt" using 1:2 + +set term svg +set output "plot.svg" + +set xlabel "Threads" +set ylabel "Runtime [ms]" + +set xrange [1:24] +set yrange [0:GPVAL_Y_MAX] + +set arrow from 12, graph 0 to 12, graph 1 nohead lw 0.5 lc rgb "0xC0C0C0" + +p 1761.55 title "single-threaded" lt black \ +, "results-shallow-static.txt" using 1:2 w l title "shallow, static" \ +, "results-shallow-dynamic.txt" using 1:2 w l title "shallow, dynamic" \ +, "results-collapsed-static.txt" using 1:2 w l title "collapsed, static" \ +#, "results-collapsed-dynamic.txt" using 1:2 w l title "collapsed, dynamic" \ diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/plot.svg b/projects/2021/groupK_burgers_openmp/branch-omp/plot.svg new file mode 100644 index 00000000..887be669 --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/plot.svg @@ -0,0 +1,198 @@ + + + +Gnuplot +Produced by GNUPLOT 5.4 patchlevel 2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 + + + + + 500 + + + + + 1000 + + + + + 1500 + + + + + 2000 + + + + + 2500 + + + + + 3000 + + + + + 3500 + + + + + 4000 + + + + + 5 + + + + + 10 + + + + + 15 + + + + + 20 + + + + + + + + + + + + + Runtime [ms] + + + + + Threads + + + + + single-threaded + + + single-threaded + + + + + + shallow, static + + + shallow, static + + + + + + shallow, dynamic + + + shallow, dynamic + + + + + + collapsed, static + + + collapsed, static + + + + + + + + + + + + + + + + + + diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-dynamic.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-dynamic.txt new file mode 100644 index 00000000..5bf5552f --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-dynamic.txt @@ -0,0 +1,24 @@ +1 12935.7 360.757 12680.6 2 +2 111735 161.192 111621 2 +3 121632 270.072 121441 2 +4 124374 156.605 124180 5 +5 102743 17.7151 102730 2 +6 88622.3 136.157 88526 2 +7 78737.6 232.754 78573 2 +8 70705 135.711 70553.9 4 +9 66842.6 88.6509 66779.9 2 +10 64278.5 154.226 64169.4 2 +11 62228.7 183.461 62099 2 +12 60467.4 59.3568 60425.4 2 +13 58008.2 10.2265 58001 2 +14 54951.5 29.5089 54918.2 4 +15 52639.1 34.7557 52614.5 2 +16 50617.5 129.173 50526.2 2 +17 48262.1 6.0242 48257.9 2 +18 46833.2 28.7177 46812.9 2 +19 45230 12.5256 45221.1 2 +20 43992.6 0.927018 43991.9 2 +21 42174.7 7.70656 42169.2 2 +22 40838.7 21.838 40823.3 2 +23 39799.1 53.1698 39733.3 5 +24 42077 286.367 41874.5 2 diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-static.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-static.txt new file mode 100644 index 00000000..9bc040ab --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-static.txt @@ -0,0 +1,24 @@ +1 3981.12 56.9674 3914.87 38 +2 2426.23 36.3124 2363.07 23 +3 1922.95 58.7132 1856.75 24 +4 1636.56 64.1493 1571.75 51 +5 1518.76 50.4952 1421.02 21 +6 1412.58 70.5845 1275.9 39 +7 1323.95 42.6127 1248.95 42 +8 1249.74 28.4438 1206.39 60 +9 1293.59 38.2212 1208.93 48 +10 1231.64 35.2432 1160.04 30 +11 1222.88 30.9767 1162.71 39 +12 1213.74 25.8987 1175.75 30 +13 1364.55 17.689 1320.49 30 +14 1546.98 8.85813 1535.59 33 +15 1333.81 113.325 1224.58 47 +16 1523.74 27.3059 1460.9 24 +17 1466.7 2.91745 1460.98 33 +18 1415.08 68.6933 1248.24 30 +19 1463.17 15.9095 1447.6 27 +20 1393.29 84.6648 1290.35 50 +21 1461.23 18.6694 1433.07 23 +22 1427.35 27.7095 1398.9 23 +23 1583.25 55.8459 1523.57 24 +24 2268.06 137.569 2103.68 22 diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-dynamic.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-dynamic.txt new file mode 100644 index 00000000..a1a65ce1 --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-dynamic.txt @@ -0,0 +1,24 @@ +1 1820.68 28.8543 1788.09 42 +2 1800.82 45.1005 1719.79 30 +3 1652.87 24.5901 1625.1 38 +4 1709.06 16.931 1677.26 29 +5 1506.15 22.8942 1472.38 50 +6 1485.3 26.5845 1457.99 33 +7 1573.28 61.0053 1468.64 42 +8 1523.51 28.4464 1471.67 21 +9 1496.66 65.6791 1397.31 36 +10 1499.12 49.2789 1399.24 26 +11 1456.79 36.713 1402.69 59 +12 1480.78 26.3377 1452.96 35 +13 1625.52 192.755 1387.4 99 +14 1565.94 176.501 1395.46 63 +15 1583.1 168.616 1386.43 60 +16 1584.13 181.155 1410.02 72 +17 1540.47 166.82 1407.7 66 +18 1565.17 161.84 1422.28 32 +19 1495.8 185.809 1364.82 62 +20 1667.31 184.946 1396.48 23 +21 1807.79 10.7196 1782.79 21 +22 1808.32 10.3299 1795.53 28 +23 1793.4 18.6433 1767.01 30 +24 2145.88 70.3782 2041.5 33 diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-static.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-static.txt new file mode 100644 index 00000000..c56251df --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-static.txt @@ -0,0 +1,24 @@ +1 1823.55 26.2263 1782.73 23 +2 1401.58 19.8054 1349.46 96 +3 1199.78 20.8571 1167.98 30 +4 1141.46 19.449 1111.81 24 +5 1122.95 22.5619 1092.07 38 +6 1015.55 47.5705 972.33 47 +7 1016.55 13.0833 995.12 24 +8 1034.47 26.4825 953.92 32 +9 1028.39 73.6957 949.22 66 +10 1021.82 30.7757 985.06 42 +11 1029.58 23.5031 996.72 54 +12 1049.21 28.4955 1012.32 27 +13 1148.73 32.048 1099.74 33 +14 1226.55 116.99 1060.86 42 +15 1104.32 64.5803 1048.97 48 +16 1381.24 11.6949 1363.26 36 +17 1246.86 125.312 1062.78 57 +18 1258.27 123.22 1072.89 39 +19 1362.96 19.4314 1333.75 33 +20 1214.86 133.691 1055.18 44 +21 1339.66 28.7203 1311.06 48 +22 1374.55 57.2419 1347.98 33 +23 1527.07 139.481 1365.03 37 +24 1943.73 351.194 1581.44 36 diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-single.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-single.txt new file mode 100644 index 00000000..7771829d --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-single.txt @@ -0,0 +1 @@ +1 1761.55 15.7786 1734.09 227 diff --git a/projects/2021/groupK_burgers_openmp/branch-upcxx/main.cpp b/projects/2021/groupK_burgers_openmp/branch-upcxx/main.cpp new file mode 100644 index 00000000..5c7e07ce --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-upcxx/main.cpp @@ -0,0 +1,725 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using Scalar = double; +using TemplateParamScalar = +#if __cpp_nontype_template_args >= 201911 + Scalar +#else + long int +#endif +; +// using Vector3 = Eigen::Matrix; +using Vector = Eigen::Matrix; +using Vector2 = Eigen::Matrix; +using Matrix = Eigen::Matrix; + +template +concept Tensoroid = true; // CBA +template +concept Matrixoid = requires(T const& m) { + { m(0, 0) } -> std::convertible_to; + { m.rows() } -> std::integral; + { m.cols() } -> std::integral; + // more stuff but CBA +}; +template +concept VectorField = requires(T const& m) { + { m[0](0, 0) } -> std::convertible_to; + { m[0] } -> Matrixoid; + { m.rows() } -> std::integral; + { m.cols() } -> std::integral; + // more stuff but CBA +}; +template +concept Vectoroid = requires(T const& v) { + { v[0] } -> std::convertible_to; + // more stuff but CBA +}; + +namespace Detail { + template + struct IsEigenSequence : std::false_type {}; + template + struct IsEigenSequence> : std::true_type {}; +} +template +concept Sequenceoid = Detail::IsEigenSequence::value; + +template +[[nodiscard]] constexpr T sq(T const& x) { + return x * x; +} +[[nodiscard]] constexpr auto absi(std::signed_integral auto x) noexcept { + assert(x != std::numeric_limits::min()); + return x < 0 ? -x : x; +} +[[nodiscard]] constexpr auto absi(std::unsigned_integral auto x) noexcept { + return x; +} +[[nodiscard]] constexpr unsigned exp2i(unsigned x) noexcept { + assert(x < std::numeric_limits::digits); + return 1<; + +template +struct ScopeGuard { + Invocable invocable; + [[nodiscard]] explicit constexpr ScopeGuard(Invocable&& invocable) noexcept(std::is_nothrow_move_constructible_v) : invocable(std::move(invocable)) {} + [[nodiscard]] explicit constexpr ScopeGuard(Invocable const& invocable) noexcept(std::is_nothrow_copy_constructible_v) : invocable(invocable) {} + constexpr ~ScopeGuard() noexcept(std::is_nothrow_invocable_v) { invocable(); } +}; + +template +struct Rank1Slice { + Tensor* tensor; + std::array position; + + [[nodiscard]] constexpr Rank1Slice(Tensor& tensor, std::convertible_to auto const&... indices) + : tensor(std::addressof(tensor)), position{ indices... } { + static_assert(sizeof...(indices) == ranks); + } + + // not multi-thread friendly + [[nodiscard]] constexpr decltype(auto) operator[](IndexType i) const { + assert(tensor); + const_cast(this)->position[slicedRank] += i; + const ScopeGuard restore([this, i]{ const_cast(this)->position[slicedRank] -= i; }); + return std::apply(*tensor, position); + } +}; +template +[[nodiscard]] constexpr auto rank1Slice(Tensoroid auto& tensor, std::integral auto const&... indices) { + return Rank1Slice, sizeof...(indices), slicedRank, std::common_type_t...>>(tensor, indices...); +} + +template typename F, typename I, typename... Args> +struct FoldR; +template typename F, typename I, typename T> +struct FoldR : std::type_identity::type> {}; +template typename F, typename I, typename T, typename U, typename... Tail> +struct FoldR : std::type_identity::type>::type> {}; + +template +struct IndexCoeffPair { + static constexpr auto index = index_; + static constexpr auto factor = factor_; +}; +template +struct IsIndexCoeffPair : std::false_type {}; +template +struct IsIndexCoeffPair> : std::true_type {}; +template +concept IndexCoeff = IsIndexCoeffPair::value; + +template +struct IndexMin : std::type_identity> {}; +template +struct IndexMax : std::type_identity Rhs::index), Lhs, Rhs>> {}; + +enum class StencilType { asymmetric, symmetric, antisymmetric }; +template +struct Stencil /* 1D only */ { + static constexpr auto min_index = FoldR::max(), {}>, Pairs...>::type::index; + static constexpr auto max_index = FoldR::min(), {}>, Pairs...>::type::index; + + static constexpr auto left = type == StencilType::asymmetric ? min_index : -std::max(absi(min_index), max_index); + static constexpr auto right = type == StencilType::asymmetric ? max_index : std::max(absi(min_index), max_index); + static constexpr auto margin = sizeof...(Pairs) > 0 ? std::max(absi(left), right) : 0; + + [[nodiscard]] static constexpr auto apply(Vectoroid auto&& vector) { + switch(type) { + case StencilType::asymmetric: + return ((Pairs::factor * vector[Pairs::index]) + ...); + case StencilType::symmetric: + return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] + vector[-Pairs::index] : vector[Pairs::index])) + ...); + case StencilType::antisymmetric: + return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] - vector[-Pairs::index] : vector[Pairs::index])) + ...); + }; + } +}; + +using AdvectionStencil1 = Stencil, + IndexCoeffPair<2, TemplateParamScalar(- 9)>, + IndexCoeffPair<3, TemplateParamScalar( 1)>>; +using AdvectionStencil2 = Stencil, + IndexCoeffPair<1, TemplateParamScalar( 15)>, + IndexCoeffPair<2, TemplateParamScalar(- 6)>, + IndexCoeffPair<3, TemplateParamScalar( 1)>>; + +using DiffusionStencil = Stencil, + IndexCoeffPair<1, TemplateParamScalar( 16)>, + IndexCoeffPair<2, TemplateParamScalar(- 1)>>; + +constexpr int margin = std::max({ AdvectionStencil1::margin, AdvectionStencil2::margin, DiffusionStencil::margin }); + +enum UseCase { zhao, hopf_cole }; + +class VectorField2 { + Matrix uv[2]; + +public: + [[nodiscard]] VectorField2() = default; + [[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols) + : uv{ Matrix(rows, cols), Matrix(rows, cols) } { + } + [[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols, Scalar fill) + : uv{ Matrix::Constant(rows, cols, fill), Matrix::Constant(rows, cols, fill) } { + } + + [[nodiscard]] constexpr Matrix const& operator[](int i) const noexcept { assert(0 <= i && i < 2); return uv[i]; } + [[nodiscard]] constexpr Matrix & operator[](int i) noexcept { assert(0 <= i && i < 2); return uv[i]; } + + [[nodiscard]] constexpr Eigen::Index rows() const noexcept { return uv[0].rows(); } + [[nodiscard]] constexpr Eigen::Index cols() const noexcept { return uv[0].cols(); } +}; + +[[nodiscard]] constexpr VectorField2 solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, Sequenceoid auto slice_x, Sequenceoid auto slice_y) { + const int mi = slice_x.size() ? slice_x[slice_x.size() - 1] - slice_x[0] + 1 : 0; + const int mj = slice_y.size() ? slice_y[slice_y.size() - 1] - slice_y[0] + 1 : 0; + + const Matrixoid auto x2d = x(slice_x).transpose().replicate(mj, 1); + const Matrixoid auto y2d = y(slice_y).replicate(1, mi); + + const int nx = x2d.cols(), ny = x2d.rows(); + assert(nx == slice_x.size()); + assert(y2d.cols() == nx && y2d.rows() == ny); + + using std::exp; + VectorField2 result; + switch(useCase) { + case zhao: + result[0] = -4 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().cos() * (pi * y2d).array().sin() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin()); + result[1] = -2 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().cos() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin()); + break; + case hopf_cole: + result[0] = 0.75 - 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu))); + result[1] = 0.75 + 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu))); + break; + default: + assert(false); + } + return result; +} +[[nodiscard]] constexpr decltype(auto) solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y) { + return solution_factory(useCase, mu, t, x, y, Eigen::seq(0, x.size() - 1), Eigen::seq(0, y.size() - 1)); +} + +[[nodiscard]] constexpr VectorField2 initial_solution(UseCase useCase, Scalar mu, Matrixoid auto const& x, Matrixoid auto const& y) { + return solution_factory(useCase, mu, 0, x, y); +} + +Vector padVector(Vectoroid auto input, int targetLength, Scalar paddingValue){ + assert(targetLength >= input.rows()); + Vector padded(targetLength); + padded << input, paddingValue * Vector::Ones(targetLength - input.rows()); + return padded; +} + +struct DistributedConfig { + + explicit DistributedConfig(int32_t globalWidth, int32_t globalHeight, int32_t ranksWidth, int32_t ranksHeight, int32_t rankIndex, int32_t margin) : + RANKS_TOTAL(ranksWidth * ranksHeight), + RANKS_WIDTH(ranksWidth), + RANKS_HEIGHT(ranksHeight), + RANK_INDEX(rankIndex), + RANK_X(rankIndex % ranksWidth), + RANK_Y(rankIndex / ranksWidth), + GLOBAL_WIDTH(globalWidth), + GLOBAL_HEIGHT(globalHeight), + LOCAL_WIDTH((globalWidth + ranksWidth - 1) / ranksWidth), + LOCAL_HEIGHT((globalHeight + ranksHeight - 1) / ranksHeight), + STORED_WIDTH(LOCAL_WIDTH + 2 * margin), + STORED_HEIGHT(LOCAL_HEIGHT + 2 * margin), + MARGIN(margin), + DX(1.0 / (globalWidth - 1)), + DY(1.0 / (globalHeight - 1)), + GLOBAL_X(Vector::LinSpaced(globalWidth, 0, 1)), + GLOBAL_Y(Vector::LinSpaced(globalHeight, 0, 1)), + LOCAL_X(padVector(GLOBAL_X, LOCAL_WIDTH * RANKS_WIDTH, 1.0).segment(RANK_X * LOCAL_WIDTH, LOCAL_WIDTH)), + LOCAL_Y(padVector(GLOBAL_Y, LOCAL_HEIGHT * RANKS_HEIGHT, 1.0).segment(RANK_Y * LOCAL_HEIGHT, LOCAL_HEIGHT)) + { + USED_WIDTH = LOCAL_WIDTH; + if (RANK_X == RANKS_WIDTH - 1) { + USED_WIDTH = GLOBAL_WIDTH - RANK_X * LOCAL_WIDTH; + } + USED_HEIGHT = LOCAL_HEIGHT; + if (RANK_Y == RANKS_HEIGHT - 1) { + USED_HEIGHT = GLOBAL_HEIGHT - RANK_Y * LOCAL_HEIGHT; + } + if (USED_WIDTH < MARGIN || USED_HEIGHT < MARGIN) { + std::cerr << "Invalid segmentation " << USED_WIDTH << "x" << USED_HEIGHT << "\n"; + std::abort(); + } + } + + // Number of rank matrix blocks + int32_t RANKS_TOTAL; + + // Number of rank matrix blocks per global matrix width + int32_t RANKS_WIDTH; + + // Number of rank matrix blocks per global matrix height + int32_t RANKS_HEIGHT; + + // Index of local rank + int32_t RANK_INDEX; + + // X coordinate of local rank's matrix block + int32_t RANK_X; + + // Y coordinate of local rank's matrix block + int32_t RANK_Y; + + // Width of entire matrix that is being distributed + int32_t GLOBAL_WIDTH; + + // Height of entire matrix that is being distributed + int32_t GLOBAL_HEIGHT; + + // Width of matrix block that is local to this rank + int32_t LOCAL_WIDTH; + + // Height of matrix block that is local to this rank + int32_t LOCAL_HEIGHT; + + // Width of matrix block that is used on this rank, i.e. without the extra cells on the right border + int32_t USED_WIDTH; + + // Height of matrix block that is used on this rank, i.e. without the extra cells on the bottom border + int32_t USED_HEIGHT; + + // Width of matrix that is local to this rank including margins + int32_t STORED_WIDTH; + + // Height of matrix that is local to this rank including margins + int32_t STORED_HEIGHT; + + // Margin around each local matrix for communication + int32_t MARGIN; + + // X coordinate spacing + Scalar DX; + + // Y coordinate spacing + Scalar DY; + + // X coordinates of entire global matrix + Vector GLOBAL_X; + + // Y coordinates of entire global matrix + Vector GLOBAL_Y; + + // X coordinates of local matrix block + Vector LOCAL_X; + + // Y coordinates of local matrix block + Vector LOCAL_Y; + +}; + +class DistributedVectorField2 { + DistributedConfig cfg; + std::reference_wrapper, 2>>> allMatrices; + std::vector> uv; + +public: + [[nodiscard]] explicit DistributedVectorField2(const VectorField2& o, std::vector, 2>>& allMatrices, DistributedConfig config) : + cfg(std::move(config)), + allMatrices(allMatrices) { + + for (Eigen::Index dim = 0; dim < 2; dim++) { + assert(cfg.GLOBAL_WIDTH == o[dim].cols() && cfg.GLOBAL_HEIGHT == o[dim].rows()); + allMatrices[cfg.RANK_INDEX][dim] = upcxx::allocate(cfg.STORED_WIDTH * cfg.STORED_HEIGHT, 256); + uv.emplace_back(Eigen::Map(allMatrices[cfg.RANK_INDEX][dim].local(), cfg.STORED_HEIGHT, cfg.STORED_WIDTH)); + for (Eigen::Index y = cfg.LOCAL_HEIGHT * cfg.RANK_Y; y < cfg.GLOBAL_HEIGHT && y < cfg.LOCAL_HEIGHT * (cfg.RANK_Y + 1); ++y) { + std::copy(o[dim].data() + y * cfg.GLOBAL_WIDTH + cfg.RANK_X * cfg.LOCAL_WIDTH, + o[dim].data() + y * cfg.GLOBAL_WIDTH + std::min(cfg.GLOBAL_WIDTH, (cfg.RANK_X + 1) * cfg.LOCAL_WIDTH), + uv[dim].data() + cfg.MARGIN + (y - cfg.RANK_Y * cfg.LOCAL_HEIGHT + cfg.MARGIN) * cfg.STORED_WIDTH); + } + } + } + + /** + * @return if x, y are in bounds + */ + [[nodiscard]] bool inBounds(Eigen::Index x, Eigen::Index y) { + if (x < 0) return false; + if (x >= cfg.RANKS_WIDTH) return false; + if (y < 0) return false; + if (y >= cfg.RANKS_HEIGHT) return false; + return true; + } + + /** + * Synchronize borders with a neighbour given an offset + * @return future for synchronization + */ + [[nodiscard]] upcxx::future<> synchronizeRight(UseCase useCase, Scalar mu, Scalar t) { + Eigen::Index otherX = cfg.RANK_X + 1; + Eigen::Index otherY = cfg.RANK_Y; + upcxx::future<> fut = upcxx::make_future(); + + if (inBounds(otherX, otherY)) { + for (Eigen::Index dim = 0; dim < 2; dim++) { + upcxx::global_ptr otherStart = allMatrices.get()[otherY * cfg.RANKS_WIDTH + otherX][dim] + cfg.STORED_WIDTH * cfg.MARGIN + cfg.MARGIN; + auto localStart = uv[dim].data() + cfg.USED_WIDTH + cfg.MARGIN + cfg.STORED_WIDTH * cfg.MARGIN; + fut = upcxx::when_all(fut, upcxx::rget_strided<2>(otherStart, + {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, + localStart, + {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, {{cfg.MARGIN, cfg.USED_HEIGHT}})); + } + } else { + + auto slice_x = Eigen::seq(cfg.USED_WIDTH - margin, cfg.USED_WIDTH - 1); + auto slice_y = Eigen::seq(0, cfg.USED_HEIGHT - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, cfg.LOCAL_X, cfg.LOCAL_Y, slice_x, slice_y); + uv[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[0]); + uv[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[1]); + } + return fut; + } + + [[nodiscard]] upcxx::future<> synchronizeLeft(UseCase useCase, Scalar mu, Scalar t) { + Eigen::Index otherX = cfg.RANK_X - 1; + Eigen::Index otherY = cfg.RANK_Y; + upcxx::future<> fut = upcxx::make_future(); + + if (inBounds(otherX, otherY)) { + + for (Eigen::Index dim = 0; dim < 2; dim++) { + upcxx::global_ptr otherStart = allMatrices.get()[otherY * cfg.RANKS_WIDTH + otherX][dim] + cfg.LOCAL_WIDTH + cfg.STORED_WIDTH * cfg.MARGIN; + auto localStart = uv[dim].data() + cfg.STORED_WIDTH * cfg.MARGIN; + fut = upcxx::when_all(fut, upcxx::rget_strided<2>(otherStart, + {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, + localStart, + {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, {{cfg.MARGIN, cfg.USED_HEIGHT}})); + } + } else { + + auto slice_x = Eigen::seq(0, cfg.MARGIN - 1); + auto slice_y = Eigen::seq(0, cfg.USED_HEIGHT - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, cfg.LOCAL_X, cfg.LOCAL_Y, slice_x, slice_y); + uv[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[0]); + uv[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[1]); + } + return fut; + } + + [[nodiscard]] upcxx::future<> synchronizeUp(UseCase useCase, Scalar mu, Scalar t) { + Eigen::Index otherX = cfg.RANK_X; + Eigen::Index otherY = cfg.RANK_Y - 1; + upcxx::future<> fut = upcxx::make_future(); + if (inBounds(otherX, otherY)) { + + for (Eigen::Index dim = 0; dim < 2; dim++) { + upcxx::global_ptr otherStart = allMatrices.get()[otherY * cfg.RANKS_WIDTH + otherX][dim] + cfg.MARGIN + cfg.STORED_WIDTH * cfg.LOCAL_HEIGHT; + auto localStart = uv[dim].data() + cfg.MARGIN; + fut = upcxx::when_all(fut, upcxx::rget_strided<2>(otherStart, + {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, + localStart, + {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, {{cfg.USED_WIDTH, cfg.MARGIN}})); + } + } else { + + auto slice_x = Eigen::seq(0, cfg.USED_WIDTH - 1); + auto slice_y = Eigen::seq(0, cfg.MARGIN - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, cfg.LOCAL_X, cfg.LOCAL_Y, slice_x, slice_y); + uv[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[0]); + uv[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[1]); + } + return fut; + } + + [[nodiscard]] upcxx::future<> synchronizeDown(UseCase useCase, Scalar mu, Scalar t) { + Eigen::Index otherX = cfg.RANK_X; + Eigen::Index otherY = cfg.RANK_Y + 1; + upcxx::future<> fut = upcxx::make_future(); + if (inBounds(otherX, otherY)) { + + for (Eigen::Index dim = 0; dim < 2; dim++) { + upcxx::global_ptr otherStart = allMatrices.get()[otherY * cfg.RANKS_WIDTH + otherX][dim] + cfg.MARGIN + cfg.MARGIN * cfg.STORED_WIDTH; + auto localStart = uv[dim].data() + cfg.MARGIN + cfg.STORED_WIDTH * (cfg.MARGIN + cfg.USED_HEIGHT); + fut = upcxx::when_all(fut, upcxx::rget_strided<2>(otherStart, + {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, + localStart, + {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, {{cfg.USED_WIDTH, cfg.MARGIN}})); + } + } else { + + auto slice_x = Eigen::seq(0, cfg.USED_WIDTH - 1); + auto slice_y = Eigen::seq(cfg.USED_HEIGHT - cfg.MARGIN, cfg.USED_HEIGHT - 1); + VectorField2 uv_ex = solution_factory(useCase, mu, t, cfg.LOCAL_X, cfg.LOCAL_Y, slice_x, slice_y); + uv[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[0]); + uv[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[1]); + } + return fut; + } + + [[nodiscard]] Eigen::Map const& operator[](int i) const noexcept { assert(0 <= i && i < 2); return uv[i]; } + [[nodiscard]] Eigen::Map & operator[](int i) noexcept { assert(0 <= i && i < 2); return uv[i]; } + + [[nodiscard]] Eigen::Index rows() const noexcept { return uv[0].rows(); } + [[nodiscard]] Eigen::Index cols() const noexcept { return uv[0].cols(); } +}; + +[[nodiscard]] constexpr Scalar advection(Scalar d, Scalar w, Vectoroid auto const& phiSlice) { + return + ( w * AdvectionStencil1::apply(phiSlice) + - std::abs(w) * AdvectionStencil2::apply(phiSlice)) / (60 * d); +} +template +void advection(Vectoroid auto const& d, Matrixoid auto const& w, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable auto assignment) { + const int nx = w.cols(), ny = w.rows(); + assert(phi.cols() == nx && phi.rows() == ny); + assert(out.cols() == nx && out.rows() == ny); + for(int x = margin; x < nx - margin; ++x) + for(int y = margin; y < ny - margin; ++y) + assignment(out(y, x), advection(d[advectRank], w(y, x), rank1Slice(phi, y, x))); +} +void advection(Vectoroid auto const& d, VectorField auto const& uv, VectorField auto& adv) { + advection<1>(d, uv[0], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + advection<0>(d, uv[1], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); + + advection<1>(d, uv[0], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + advection<0>(d, uv[1], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); +} + +[[nodiscard]] constexpr Scalar diffusion(Scalar d, Vectoroid auto const& phiSlice) { + return 1 / (12 * sq(d)) * DiffusionStencil::apply(phiSlice); +} +template +void diffusion(Vectoroid auto const& d, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable auto assignment) { + const int nx = phi.cols(), ny = phi.rows(); + assert(out.cols() == nx && out.rows() == ny); + for(int x = margin; x < nx - margin; ++x) + for(int y = margin; y < ny - margin; ++y) + assignment(out(y, x), diffusion(d[diffuseRank], rank1Slice(phi, y, x))); +} +void diffusion(Vectoroid auto const& d, VectorField auto const& uv, VectorField auto& diff) { + diffusion<1>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + diffusion<0>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); + + diffusion<1>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs = rhs; }); + diffusion<0>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; }); +} + +void rk_stage( + Scalar mu, + VectorField auto const& in_now, + VectorField auto const& in_tmp, + VectorField auto& out, + VectorField auto& adv, + VectorField auto& diff, + Scalar dt, + Vectoroid auto const& d +) { + advection(d, in_tmp, adv); + diffusion(d, in_tmp, diff); + out[0] = in_now[0] + dt * (-adv[0] + mu * diff[0]); + out[1] = in_now[1] + dt * (-adv[1] + mu * diff[1]); +} + + +// TODO: only a makeshift because my stdlib doesn't implement this +template +std::basic_ostream& operator<<(std::basic_ostream& os, std::chrono::duration const& d) { + std::basic_ostringstream s; + s.flags(os.flags()); + s.imbue(os.getloc()); + s.precision(os.precision()); + s << d.count(); + if constexpr(std::is_same_v) s << "ns"; + else if constexpr(std::is_same_v) s << "µs"; + else if constexpr(std::is_same_v) s << "ms"; + else if constexpr(std::is_same_v>) s << "s"; + else { + s << '[' << Period::type::num; + if constexpr(Period::type::den != 1) + s << '/' << Period::type::den; + s << ']'; + } + return os << s.str(); +} + +template +struct WelfordVariance { + std::size_t n = 0; + T mean{}; + T m{}; + + constexpr void update(T const& x) { + ++n; + const T old_mean = mean; + mean = ((n - 1) * old_mean + x) / n; + m += (x - old_mean) * (x - mean); + } + + [[nodiscard]] constexpr T variance() { return m / n; } + [[nodiscard]] constexpr T sample_variance() { return m / (n - 1); } +}; + +struct BenchmarkReport { + using Duration = std::chrono::duration; + + Duration best; + Duration average; + Duration variance; + unsigned runs; +}; +std::ostream& operator<<(std::ostream& stream, BenchmarkReport const& report) { + stream << "runtime: " << report.average << " ± " << std::sqrt(report.variance.count()) << "ms" << "; best: " << report.best << "; n=" << report.runs << '\n'; + return stream; +} + +template +[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak, Clock clock) { + assert(streak > 0); + BenchmarkReport result; + WelfordVariance variance; + result.best = BenchmarkReport::Duration(std::numeric_limits::max()); + for(unsigned current_streak = 0; current_streak < streak; ) { + upcxx::barrier(); + const auto start = clock.now(); + invocable(); + const auto end = clock.now(); + auto duration = std::chrono::duration_cast(end - start); + upcxx::barrier(); + duration = upcxx::broadcast(duration, 0).wait(); + if(duration < result.best) { + result.best = duration; + current_streak = 0; + } else { + ++current_streak; + } + variance.update(duration.count()); + } + result.average = BenchmarkReport::Duration(variance.mean); + result.variance = BenchmarkReport::Duration(variance.sample_variance()); + result.runs = variance.n; + return result; +} +template +[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak = 20) { + if constexpr(std::chrono::high_resolution_clock::is_steady) + return benchmark(std::forward(invocable), streak, std::chrono::high_resolution_clock{}); + else + return benchmark(std::forward(invocable), streak, std::chrono::steady_clock{}); +} + +int main(int argc, char **argv) { + + if (argc != 5) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 2; + } + // use case + constexpr UseCase useCase = zhao; + + // diffusion coefficient + const Scalar mu = 0.1; + + // grid + const int factor = std::stol(argv[1]); + const uint32_t coreWidth = std::stol(argv[2]); + const uint32_t coreHeight = std::stol(argv[3]); + upcxx::init(); + const uint32_t coreIndex = upcxx::rank_me(); + const DistributedConfig cfg(factor, factor, coreWidth, coreHeight, coreIndex, margin); + + if (coreWidth * coreHeight != upcxx::rank_n()) { + std::cerr << "Using invalid number of ranks\n"; + return 1; + } + + // time + const Scalar cfl = 1; + const Scalar timestep = cfl / sq(cfg.GLOBAL_WIDTH-1); + const Scalar n_iter = sq(cfg.GLOBAL_WIDTH-1) * std::stol(argv[4]) / 1000; + // output + const int print_period = 0; + // vector fields + + const auto syncMatrices = [](std::vector, 2>>& mats) { + for (uint32_t i = 0; i < mats.size(); i++) { + mats[i] = upcxx::broadcast(mats[i], i).wait(); + } + }; + + const auto single_threaded_task = [&]{ + + std::vector, 2>> allMatricesNow(cfg.RANKS_TOTAL); + DistributedVectorField2 uv_now(VectorField2(cfg.GLOBAL_HEIGHT, cfg.GLOBAL_WIDTH, 0), allMatricesNow, cfg); + syncMatrices(allMatricesNow); + + std::vector, 2>> allMatricesNew(cfg.RANKS_TOTAL); + DistributedVectorField2 uv_new(initial_solution(useCase, mu, cfg.GLOBAL_X, cfg.GLOBAL_Y), allMatricesNew, cfg); + syncMatrices(allMatricesNew); + + upcxx::barrier(); + upcxx::when_all(uv_new.synchronizeRight(useCase, mu, 0), uv_new.synchronizeLeft(useCase, mu, 0), uv_new.synchronizeUp(useCase, mu, 0), uv_new.synchronizeDown(useCase, mu, 0)).wait(); + upcxx::barrier(); + + VectorField2 adv(uv_now[0].rows(), uv_now[0].cols(), 0); + VectorField2 diff(uv_now[0].rows(), uv_now[0].cols(), 0); + + const auto rk_fractions = { 1.0 / 3, 1.0 / 2, 1.0 }; + Scalar t = 0; + + for(int i = 0; i < n_iter; ++i) { + uv_now = uv_new; + + for(const double rk_fraction : rk_fractions) { + const Scalar dt = rk_fraction * timestep; + rk_stage(mu, uv_now, uv_new, uv_new, adv, diff, dt, Vector2{ cfg.DX, cfg.DY }); + upcxx::barrier(); + upcxx::when_all(uv_new.synchronizeRight(useCase, mu, t + dt), uv_new.synchronizeLeft(useCase, mu, t + dt), uv_new.synchronizeUp(useCase, mu, t + dt), uv_new.synchronizeDown(useCase, mu, t + dt)).wait(); + upcxx::barrier(); + } + + t += timestep; + if(print_period > 0 && ((i+1) % print_period == 0 || i+1 == n_iter)) { + VectorField2 uv_ex = solution_factory(useCase, mu, t, cfg.LOCAL_X.segment(0, cfg.USED_WIDTH), cfg.LOCAL_Y.segment(0, cfg.USED_HEIGHT)); + const Scalar local_err_u = (uv_new[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH) - uv_ex[0]).squaredNorm() * cfg.DX * cfg.DY; + const Scalar local_err_v = (uv_new[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH) - uv_ex[1]).squaredNorm() * cfg.DX * cfg.DY; + const Scalar err_u = std::sqrt(upcxx::reduce_all(local_err_u, std::plus()).wait()); + const Scalar err_v = std::sqrt(upcxx::reduce_all(local_err_v, std::plus()).wait()); + upcxx::barrier(); + if (cfg.RANK_INDEX == 0) { + std::cout << "Iteration " << std::right << std::setfill(' ') << std::setw(6) << i + 1; + std::cout << std::scientific << std::setprecision(4); + std::cout << ": ||u - uex|| = " << err_u << " m/s, ||v - vex|| = " << err_v << " m/s\n"; + std::cout << std::defaultfloat; + } + } + } + }; + if (upcxx::rank_me() == 0) { + std::cout << "Time: " << benchmark(single_threaded_task); + } else { + benchmark(single_threaded_task); + } + + upcxx::finalize(); +} diff --git a/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_small_grid b/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_small_grid new file mode 100644 index 00000000..bd7c891b --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_small_grid @@ -0,0 +1,10 @@ +Time: runtime: 5799.27ms ± 75.6701ms; best: 5678.92ms; n=33 +Time: runtime: 2074.05ms ± 46.3968ms; best: 1990.05ms; n=47 +Time: runtime: 1692.26ms ± 48.7114ms; best: 1609.17ms; n=43 +Time: runtime: 1392.18ms ± 57.8365ms; best: 1301.65ms; n=33 +Time: runtime: 1303.88ms ± 95.567ms; best: 1180.74ms; n=36 +Time: runtime: 5788.49ms ± 71.2364ms; best: 5660.38ms; n=45 +Time: runtime: 3537.47ms ± 59.8368ms; best: 3432.72ms; n=49 +Time: runtime: 2434.85ms ± 38.4084ms; best: 2382.31ms; n=30 +Time: runtime: 1802.55ms ± 97.6827ms; best: 1615.69ms; n=23 +Time: runtime: 1787.4ms ± 43.9987ms; best: 1720.63ms; n=48 diff --git a/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_strong_scaling b/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_strong_scaling new file mode 100644 index 00000000..f53c8111 --- /dev/null +++ b/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_strong_scaling @@ -0,0 +1,10 @@ +Time: runtime: 4279.9ms ± 134.979ms; best: 4030.75ms; n=59 +Time: runtime: 1100.13ms ± 28.4733ms; best: 1066.17ms; n=35 +Time: runtime: 800.246ms ± 44.5372ms; best: 753.12ms; n=28 +Time: runtime: 546.107ms ± 17.7146ms; best: 521.13ms; n=27 +Time: runtime: 497.132ms ± 43.4852ms; best: 443.236ms; n=37 +Time: runtime: 4401.04ms ± 132.005ms; best: 4201.34ms; n=29 +Time: runtime: 2190.8ms ± 45.1649ms; best: 2107.44ms; n=38 +Time: runtime: 1183.65ms ± 24.1417ms; best: 1148.65ms; n=45 +Time: runtime: 725.589ms ± 21.6998ms; best: 688.194ms; n=60 +Time: runtime: 594.244ms ± 45.5923ms; best: 564.256ms; n=32