diff --git a/projects/2021/groupK_burgers_openmp/HPC4WC.pdf b/projects/2021/groupK_burgers_openmp/HPC4WC.pdf
new file mode 100644
index 00000000..8170bad2
Binary files /dev/null and b/projects/2021/groupK_burgers_openmp/HPC4WC.pdf differ
diff --git a/projects/2021/groupK_burgers_openmp/branch-master/CMakeLists.txt b/projects/2021/groupK_burgers_openmp/branch-master/CMakeLists.txt
new file mode 100644
index 00000000..20635e44
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-master/CMakeLists.txt
@@ -0,0 +1,27 @@
+cmake_minimum_required(VERSION 3.14)
+project(HPWCProject)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+include(FetchContent)
+
+find_package(Eigen3 3.4 QUIET)
+if(NOT EIGEN_FOUND)
+	FetchContent_Declare(
+		Eigen
+		GIT_REPOSITORY https://gitlab.com/libeigen/eigen
+		GIT_TAG        3.4
+	)
+	FetchContent_MakeAvailable(Eigen)
+endif()
+
+FetchContent_Declare(
+	ProgramOptionsHxx
+	GIT_REPOSITORY https://github.com/Fytch/ProgramOptions.hxx
+)
+FetchContent_MakeAvailable(ProgramOptionsHxx)
+
+add_executable(HPWCProject main.cpp)
+target_link_libraries(HPWCProject Eigen3::Eigen ProgramOptionsHxx)
diff --git a/projects/2021/groupK_burgers_openmp/branch-master/main.cpp b/projects/2021/groupK_burgers_openmp/branch-master/main.cpp
new file mode 100644
index 00000000..67a61360
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-master/main.cpp
@@ -0,0 +1,495 @@
+#include <Eigen/Dense>
+#include <ProgramOptions.hxx>
+#include <concepts>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <numbers>
+#include <type_traits>
+#include <limits>
+#include <algorithm>
+#include <utility>
+#include <memory>
+#include <chrono>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+
+using Scalar = double;
+using TemplateParamScalar =
+#if __cpp_nontype_template_args >= 201911
+	Scalar
+#else
+	long int
+#endif
+;
+// using Vector3 = Eigen::Matrix<Scalar, 3, 1>;
+using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
+using Vector2 = Eigen::Matrix<Scalar, 2, 1>;
+using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
+
+template<typename T>
+concept Tensoroid = true; // CBA
+template<typename T>
+concept Matrixoid = requires(T const& m) {
+	{ m(0, 0) } -> std::convertible_to<Scalar>;
+	{ m.rows() } -> std::integral;
+	{ m.cols() } -> std::integral;
+	// more stuff but CBA
+};
+template<typename T>
+concept Vectoroid = requires(T const& v) {
+	{ v[0] } -> std::convertible_to<Scalar>;
+	// more stuff but CBA
+};
+
+namespace Detail {
+	template<typename T>
+	struct IsEigenSequence : std::false_type {};
+	template<typename FirstType, typename SizeType, typename IncrType>
+	struct IsEigenSequence<Eigen::ArithmeticSequence<FirstType, SizeType, IncrType>> : std::true_type {};
+}
+template<typename T>
+concept Sequenceoid = Detail::IsEigenSequence<T>::value;
+
+template<typename T>
+[[nodiscard]] constexpr T sq(T const& x) {
+	return x * x;
+}
+[[nodiscard]] constexpr auto absi(std::signed_integral auto x) noexcept {
+	assert(x != std::numeric_limits<decltype(x)>::min());
+	return x < 0 ? -x : x;
+}
+[[nodiscard]] constexpr auto absi(std::unsigned_integral auto x) noexcept {
+	return x;
+}
+[[nodiscard]] constexpr unsigned exp2i(unsigned x) noexcept {
+	assert(x < std::numeric_limits<unsigned>::digits);
+	return 1<<x;
+}
+inline constexpr Scalar pi = std::numbers::pi_v<Scalar>;
+
+template<std::invocable Invocable>
+struct ScopeGuard {
+	Invocable invocable;
+	[[nodiscard]] explicit constexpr ScopeGuard(Invocable&& invocable) noexcept(std::is_nothrow_move_constructible_v<Invocable>) : invocable(std::move(invocable)) {}
+	[[nodiscard]] explicit constexpr ScopeGuard(Invocable const& invocable) noexcept(std::is_nothrow_copy_constructible_v<Invocable>) : invocable(invocable) {}
+	constexpr ~ScopeGuard() noexcept(std::is_nothrow_invocable_v<Invocable>) { invocable(); }
+};
+
+template<Tensoroid Tensor, unsigned ranks, unsigned slicedRank, std::integral IndexType = int>
+struct Rank1Slice {
+	Tensor* tensor;
+	std::array<IndexType, ranks> position;
+
+	[[nodiscard]] constexpr Rank1Slice(Tensor& tensor, std::convertible_to<IndexType> auto const&... indices)
+	: tensor(std::addressof(tensor)), position{ indices... } {
+		static_assert(sizeof...(indices) == ranks);
+	}
+
+	// not multi-thread friendly
+	[[nodiscard]] constexpr decltype(auto) operator[](IndexType i) const {
+		assert(tensor);
+		const_cast<Rank1Slice*>(this)->position[slicedRank] += i;
+		const ScopeGuard restore([this, i]{ const_cast<Rank1Slice*>(this)->position[slicedRank] -= i; });
+		return std::apply(*tensor, position);
+	}
+};
+template<unsigned slicedRank>
+[[nodiscard]] constexpr auto rank1Slice(Tensoroid auto& tensor, std::integral auto const&... indices) {
+	return Rank1Slice<std::remove_reference_t<decltype(tensor)>, sizeof...(indices), slicedRank, std::common_type_t<std::remove_cvref_t<decltype(indices)>...>>(tensor, indices...);
+}
+
+template<template<typename, typename> typename F, typename I, typename... Args>
+struct FoldR;
+template<template<typename, typename> typename F, typename I, typename T>
+struct FoldR<F, I, T> : std::type_identity<typename F<T, I>::type> {};
+template<template<typename, typename> typename F, typename I, typename T, typename U, typename... Tail>
+struct FoldR<F, I, T, U, Tail...> : std::type_identity<typename F<T, typename FoldR<F, I, U, Tail...>::type>::type> {};
+
+template<std::integral auto index_, TemplateParamScalar factor_>
+struct IndexCoeffPair {
+	static constexpr auto index = index_;
+	static constexpr auto factor = factor_;
+};
+template<typename T>
+struct IsIndexCoeffPair : std::false_type {};
+template<std::integral auto index, TemplateParamScalar factor>
+struct IsIndexCoeffPair<IndexCoeffPair<index, factor>> : std::true_type {};
+template<typename T>
+concept IndexCoeff = IsIndexCoeffPair<T>::value;
+
+template<typename Lhs, typename Rhs>
+struct IndexMin : std::type_identity<std::conditional_t<(Lhs::index < Rhs::index), Lhs, Rhs>> {};
+template<typename Lhs, typename Rhs>
+struct IndexMax : std::type_identity<std::conditional_t<(Lhs::index > Rhs::index), Lhs, Rhs>> {};
+
+enum class StencilType { asymmetric, symmetric, antisymmetric };
+template<StencilType type, IndexCoeff... Pairs>
+struct Stencil /* 1D only */ {
+	static constexpr auto min_index = FoldR<IndexMin, IndexCoeffPair<std::numeric_limits<int>::max(), {}>, Pairs...>::type::index;
+	static constexpr auto max_index = FoldR<IndexMax, IndexCoeffPair<std::numeric_limits<int>::min(), {}>, Pairs...>::type::index;
+
+	static constexpr auto left   = type == StencilType::asymmetric ? min_index : -std::max(absi(min_index), max_index);
+	static constexpr auto right  = type == StencilType::asymmetric ? max_index :  std::max(absi(min_index), max_index);
+	static constexpr auto margin = sizeof...(Pairs) > 0 ? std::max(absi(left), right) : 0;
+
+	[[nodiscard]] static constexpr auto apply(Vectoroid auto&& vector) {
+		switch(type) {
+		case StencilType::asymmetric:
+			return ((Pairs::factor * vector[Pairs::index]) + ...);
+		case StencilType::symmetric:
+			return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] + vector[-Pairs::index] : vector[Pairs::index])) + ...);
+		case StencilType::antisymmetric:
+			return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] - vector[-Pairs::index] : vector[Pairs::index])) + ...);
+		};
+	}
+};
+
+class VectorField2 {
+	Matrix uv[2];
+
+public:
+	[[nodiscard]] VectorField2() = default;
+	[[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols)
+	: uv{ Matrix(rows, cols), Matrix(rows, cols) } {
+	}
+	[[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols, Scalar fill)
+	: uv{ Matrix::Constant(rows, cols, fill), Matrix::Constant(rows, cols, fill) } {
+	}
+
+	[[nodiscard]] constexpr Matrix const& operator[](int i) const noexcept { assert(0 <= i && i < 2); return uv[i]; }
+	[[nodiscard]] constexpr Matrix      & operator[](int i)       noexcept { assert(0 <= i && i < 2); return uv[i]; }
+
+	[[nodiscard]] constexpr Eigen::Index rows() const noexcept { return uv[0].rows(); }
+	[[nodiscard]] constexpr Eigen::Index cols() const noexcept { return uv[0].cols(); }
+};
+
+using AdvectionStencil1 = Stencil<StencilType::antisymmetric,
+	IndexCoeffPair<1, TemplateParamScalar( 45)>,
+	IndexCoeffPair<2, TemplateParamScalar(- 9)>,
+	IndexCoeffPair<3, TemplateParamScalar(  1)>>;
+using AdvectionStencil2 = Stencil<StencilType::symmetric,
+	IndexCoeffPair<0, TemplateParamScalar(-20)>,
+	IndexCoeffPair<1, TemplateParamScalar( 15)>,
+	IndexCoeffPair<2, TemplateParamScalar(- 6)>,
+	IndexCoeffPair<3, TemplateParamScalar(  1)>>;
+
+using DiffusionStencil = Stencil<StencilType::symmetric,
+	IndexCoeffPair<0, TemplateParamScalar(-30)>,
+	IndexCoeffPair<1, TemplateParamScalar( 16)>,
+	IndexCoeffPair<2, TemplateParamScalar(- 1)>>;
+
+constexpr int margin = std::max({ AdvectionStencil1::margin, AdvectionStencil2::margin, DiffusionStencil::margin });
+
+[[nodiscard]] constexpr Scalar advection(Scalar d, Scalar w, Vectoroid auto const& phiSlice) {
+	return
+		(           w  * AdvectionStencil1::apply(phiSlice)
+		 - std::abs(w) * AdvectionStencil2::apply(phiSlice)) / (60 * d);
+}
+template<std::integral auto advectRank>
+void advection(Vectoroid auto const& d, Matrixoid auto const& w, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable<Scalar&, Scalar> auto assignment) {
+	const int nx = w.cols(), ny = w.rows();
+	assert(phi.cols() == nx && phi.rows() == ny);
+	assert(out.cols() == nx && out.rows() == ny);
+	// #pragma omp parallel for schedule(static)
+	for(int x = margin; x < nx - margin; ++x)
+		for(int y = margin; y < ny - margin; ++y)
+			assignment(out(y, x), advection(d[advectRank], w(y, x), rank1Slice<advectRank>(phi, y, x)));
+}
+void advection(Vectoroid auto const& d, VectorField2 const& uv, VectorField2& adv) {
+	advection<0>(d, uv[0], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+	advection<1>(d, uv[1], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+
+	advection<0>(d, uv[0], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+	advection<1>(d, uv[1], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+}
+
+[[nodiscard]] constexpr Scalar diffusion(Scalar d, Vectoroid auto const& phiSlice) {
+	return 1 / (12 * sq(d)) * DiffusionStencil::apply(phiSlice);
+}
+template<std::integral auto diffuseRank>
+void diffusion(Vectoroid auto const& d, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable<Scalar&, Scalar> auto assignment) {
+	const int nx = phi.cols(), ny = phi.rows();
+	assert(out.cols() == nx && out.rows() == ny);
+	// #pragma omp parallel for schedule(static)
+	for(int x = margin; x < nx - margin; ++x)
+		for(int y = margin; y < ny - margin; ++y)
+			assignment(out(y, x), diffusion(d[diffuseRank], rank1Slice<diffuseRank>(phi, y, x)));
+}
+void diffusion(Vectoroid auto const& d, VectorField2 const& uv, VectorField2& diff) {
+	diffusion<0>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+	diffusion<1>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+
+	diffusion<0>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+	diffusion<1>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+}
+
+void rk_stage(
+	Scalar mu,
+	VectorField2 const& in_now,
+	VectorField2 const& in_tmp,
+	VectorField2& out,
+	VectorField2& adv,
+	VectorField2& diff,
+	Scalar dt,
+	Vectoroid auto const& d
+) {
+	advection(d, in_tmp, adv);
+	diffusion(d, in_tmp, diff);
+	out[0] = in_now[0] + dt * (-adv[0] + mu * diff[0]);
+	out[1] = in_now[1] + dt * (-adv[1] + mu * diff[1]);
+}
+
+enum UseCase { zhao, hopf_cole };
+
+[[nodiscard]] constexpr VectorField2 solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, Sequenceoid auto slice_x, Sequenceoid auto slice_y) {
+	const int mi = slice_x.size() ? slice_x[slice_x.size() - 1] - slice_x[0] + 1 : 0;
+	const int mj = slice_y.size() ? slice_y[slice_y.size() - 1] - slice_y[0] + 1 : 0;
+
+	const Matrixoid auto x2d = x(slice_x).replicate(1, mj);
+	const Matrixoid auto y2d = y(slice_y).transpose().replicate(mi, 1);
+
+	const int nx = x2d.cols(), ny = x2d.rows();
+	assert(y2d.cols() == nx && y2d.rows() == ny);
+
+	using std::exp;
+	VectorField2 result;
+	switch(useCase) {
+	case zhao:
+		result[0] = -4 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().cos() * (pi * y2d).array().sin() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin());
+		result[1] = -2 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().cos() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin());
+		break;
+	case hopf_cole:
+		result[0] = 0.75 - 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu)));
+		result[1] = 0.75 + 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu)));
+		break;
+	default:
+		assert(false);
+	}
+	return result;
+}
+[[nodiscard]] constexpr decltype(auto) solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y) {
+	return solution_factory(useCase, mu, t, x, y, Eigen::seq(0, x.size() - 1), Eigen::seq(0, y.size() - 1));
+}
+
+[[nodiscard]] constexpr VectorField2 initial_solution(UseCase useCase, Scalar mu, Matrixoid auto const& x, Matrixoid auto const& y) {
+	return solution_factory(useCase, mu, 0, x, y);
+}
+
+void enforce_boundary_conditions(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, VectorField2& uv) {
+	const int nx = x.size(), ny = y.size();
+
+	{
+		auto slice_x = Eigen::seq(0, margin - 1);
+		auto slice_y = Eigen::seq(0, ny - 1);
+		VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y);
+		uv[0](slice_x, slice_y) = std::move(uv_ex[0]);
+		uv[1](slice_x, slice_y) = std::move(uv_ex[1]);
+	}
+	{
+		auto slice_x = Eigen::seq(nx - margin, nx - 1);
+		auto slice_y = Eigen::seq(0, ny - 1);
+		VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y);
+		uv[0](slice_x, slice_y) = std::move(uv_ex[0]);
+		uv[1](slice_x, slice_y) = std::move(uv_ex[1]);
+	}
+	{
+		auto slice_x = Eigen::seq(margin, nx - margin - 1);
+		auto slice_y = Eigen::seq(0, margin - 1);
+		VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y);
+		uv[0](slice_x, slice_y) = std::move(uv_ex[0]);
+		uv[1](slice_x, slice_y) = std::move(uv_ex[1]);
+	}
+	{
+		auto slice_x = Eigen::seq(margin, nx - margin - 1);
+		auto slice_y = Eigen::seq(ny - margin, ny - 1);
+		VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y);
+		uv[0](slice_x, slice_y) = std::move(uv_ex[0]);
+		uv[1](slice_x, slice_y) = std::move(uv_ex[1]);
+	}
+}
+
+// TODO: only a makeshift because my stdlib doesn't implement this
+template<typename CharT, typename Traits, typename Rep, typename Period>
+std::basic_ostream<CharT, Traits>& operator<<(std::basic_ostream<CharT, Traits>& os, std::chrono::duration<Rep, Period> const& d) {
+	std::basic_ostringstream<CharT, Traits> s;
+	s.flags(os.flags());
+	s.imbue(os.getloc());
+	s.precision(os.precision());
+	s << d.count();
+	     if constexpr(std::is_same_v<typename Period::type, std::nano    >) s << "ns";
+	else if constexpr(std::is_same_v<typename Period::type, std::micro   >) s << "µs";
+	else if constexpr(std::is_same_v<typename Period::type, std::milli   >) s << "ms";
+	else if constexpr(std::is_same_v<typename Period::type, std::ratio<1>>) s << "s";
+	else {
+		s << '[' << Period::type::num;
+		if constexpr(Period::type::den != 1)
+			s << '/' << Period::type::den;
+		s << ']';
+	}
+	return os << s.str();
+}
+
+template<typename T>
+struct WelfordVariance {
+	std::size_t n = 0;
+	T mean{};
+	T m{};
+
+	constexpr void update(T const& x) {
+		++n;
+		const T old_mean = mean;
+		mean = ((n - 1) * old_mean + x) / n;
+		m += (x - old_mean) * (x - mean);
+	}
+
+	[[nodiscard]] constexpr T variance() { return m / n; }
+	[[nodiscard]] constexpr T sample_variance() { return m / (n - 1); }
+};
+
+struct BenchmarkReport {
+	using Duration = std::chrono::duration<double, std::milli>;
+
+	Duration best;
+	Duration average;
+	Duration stddev;
+	unsigned runs;
+};
+std::ostream& operator<<(std::ostream& stream, BenchmarkReport const& report) {
+	stream << "runtime: " << report.average << " ± " << report.stddev << "; best: " << report.best << "; n=" << report.runs << '\n';
+	return stream;
+}
+
+template<std::invocable Invocable, typename Clock>
+[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak, Clock clock) {
+	assert(streak > 0);
+	using std::sqrt;
+	BenchmarkReport result;
+	WelfordVariance<BenchmarkReport::Duration::rep> variance;
+	result.best = BenchmarkReport::Duration(std::numeric_limits<double>::max());
+	for(unsigned current_streak = 0; current_streak < streak; ) {
+		const auto start = clock.now();
+		invocable();
+		const auto end = clock.now();
+		const auto duration = std::chrono::duration_cast<BenchmarkReport::Duration>(end - start);
+		if(duration < result.best) {
+			result.best = duration;
+			current_streak = 0;
+		} else {
+			++current_streak;
+		}
+		variance.update(duration.count());
+	}
+	result.average = BenchmarkReport::Duration(variance.mean);
+	result.stddev = BenchmarkReport::Duration(sqrt(variance.sample_variance()));
+	result.runs = variance.n;
+	return result;
+}
+template<std::invocable Invocable>
+[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak = 100) {
+	if constexpr(std::chrono::high_resolution_clock::is_steady)
+		return benchmark(std::forward<Invocable>(invocable), streak, std::chrono::high_resolution_clock{});
+	else
+		return benchmark(std::forward<Invocable>(invocable), streak, std::chrono::steady_clock{});
+}
+
+int main(int argc, char** argv)
+try {
+	unsigned int sideLength = 21;
+	unsigned int iterationsOption = -1;
+	unsigned int streak = 100;
+
+	po::parser parser;
+	auto& help = parser["help"]
+		.abbreviation('?')
+		.description("print this help screen");
+	parser["sideLength"]
+		.abbreviation('l')
+		.description("set the grid side length (default: 21)")
+		.bind(sideLength);
+	parser["iterations"]
+		.abbreviation('i')
+		.description("set the grid side length (default: (sideLength-1)^2)")
+		.bind(iterationsOption);
+	parser["streak"]
+		.abbreviation('s')
+		.description("set the length of a streak required to escape the benchmarking algorithm (default: 100)")
+		.bind(streak);
+
+	if(!parser(argc, argv))
+		return EXIT_FAILURE;
+
+	if(help.was_set()) {
+		std::cout << parser << '\n';
+		return EXIT_SUCCESS;
+	}
+
+	if(sideLength < 1)
+		throw std::range_error("sideLength shall be >= 1");
+
+	if(streak < 1)
+		throw std::range_error("streak shall be >= 1");
+
+	// use case
+	constexpr UseCase useCase = zhao;
+
+	// diffusion coefficient
+	const Scalar mu = 0.1;
+
+	// time
+	const Scalar cfl = 1;
+	const Scalar timestep = cfl / sq(sideLength - 1);
+	const int n_iter = iterationsOption != -1 ? iterationsOption : sq(sideLength - 1);
+
+	// output
+	const int print_period = 0;
+
+	// grid
+	const int nx = sideLength;
+	const Vector x = Vector::LinSpaced(nx, 0, 1);
+	const Scalar dx = 1.0 / (nx - 1);
+	const int ny = sideLength;
+	const Vector y = Vector::LinSpaced(ny, 0, 1);
+	const Scalar dy = 1.0 / (ny - 1);
+
+	// vector fields
+	VectorField2 uv_now(ny, nx);
+	VectorField2 uv_new(ny, nx);
+	VectorField2 adv(ny, nx, 0);
+	VectorField2 diff(ny, nx, 0);
+
+	const auto rk_fractions = { 1.0 / 3, 1.0 / 2, 1.0 };
+
+	const auto single_threaded_task = [&]{
+		uv_now = VectorField2(ny, nx, 0);
+		uv_new = initial_solution(useCase, mu, x, y);
+
+		Scalar t = 0;
+		for(int i = 0; i < n_iter; ++i) {
+			uv_now = uv_new;
+
+			for(const double rk_fraction : rk_fractions) {
+				const Scalar dt = rk_fraction * timestep;
+				rk_stage(mu, uv_now, uv_new, uv_new, adv, diff, dt, Vector2{ dx, dy });
+				enforce_boundary_conditions(useCase, mu, t + dt, x, y, uv_new);
+			}
+
+			t += timestep;
+			if(print_period > 0 && ((i+1) % print_period == 0 || i+1 == n_iter)) {
+				VectorField2 uv_ex  = solution_factory(useCase, mu, t, x, y);
+				const Scalar err_u = (uv_new[0].block(margin, margin, ny - 2 * margin, nx - 2 * margin) - uv_ex[0].block(margin, margin, ny - 2 * margin, nx - 2 * margin)).norm() * std::sqrt(dx * dy);
+				const Scalar err_v = (uv_new[1].block(margin, margin, ny - 2 * margin, nx - 2 * margin) - uv_ex[1].block(margin, margin, ny - 2 * margin, nx - 2 * margin)).norm() * std::sqrt(dx * dy);
+				std::cout << "Iteration " << std::right << std::setfill(' ') << std::setw(6) << i + 1;
+				std::cout << std::scientific << std::setprecision(4);
+				std::cout << ": ||u - uex|| = " << err_u << " m/s, ||v - vex|| = " << err_v << " m/s\n";
+				std::cout << std::defaultfloat;
+			}
+		}
+	};
+	std::cout << "Single-threaded benchmark:\n" << benchmark(single_threaded_task, streak);
+} catch(std::exception const& exception) {
+	std::cerr << po::red << "exception";
+	std::cerr << ": " << exception.what() << '\n';
+	return EXIT_FAILURE;
+}
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/CMakeLists.txt b/projects/2021/groupK_burgers_openmp/branch-omp/CMakeLists.txt
new file mode 100644
index 00000000..aa88cbcc
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.14)
+project(HPWCProject)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+include(FetchContent)
+
+find_package(Eigen3 3.4 QUIET)
+if(NOT EIGEN_FOUND)
+	FetchContent_Declare(
+		Eigen
+		GIT_REPOSITORY https://gitlab.com/libeigen/eigen
+		GIT_TAG        3.4
+	)
+	FetchContent_MakeAvailable(Eigen)
+endif()
+
+FetchContent_Declare(
+	ProgramOptionsHxx
+	GIT_REPOSITORY https://github.com/Fytch/ProgramOptions.hxx
+)
+FetchContent_MakeAvailable(ProgramOptionsHxx)
+
+find_package(OpenMP REQUIRED)
+
+add_executable(HPWCProject main.cpp)
+target_link_libraries(HPWCProject Eigen3::Eigen ProgramOptionsHxx OpenMP::OpenMP_CXX)
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/bench.sh b/projects/2021/groupK_burgers_openmp/branch-omp/bench.sh
new file mode 100755
index 00000000..73484687
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/bench.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+for (( i=1; i <= $1; ++i )) do
+	./HPWCProject -l480 -i229 -s20 -n"$i"
+done
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/main.cpp b/projects/2021/groupK_burgers_openmp/branch-omp/main.cpp
new file mode 100644
index 00000000..3de91dfd
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/main.cpp
@@ -0,0 +1,511 @@
+#include <Eigen/Dense>
+#include <ProgramOptions.hxx>
+#include <omp.h>
+#include <concepts>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <numbers>
+#include <type_traits>
+#include <limits>
+#include <algorithm>
+#include <utility>
+#include <memory>
+#include <chrono>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+
+using Scalar = double;
+using TemplateParamScalar =
+#if __cpp_nontype_template_args >= 201911
+	Scalar
+#else
+	long int
+#endif
+;
+// using Vector3 = Eigen::Matrix<Scalar, 3, 1>;
+using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
+using Vector2 = Eigen::Matrix<Scalar, 2, 1>;
+using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>;
+
+template<typename T>
+concept Tensoroid = true; // CBA
+template<typename T>
+concept Matrixoid = requires(T const& m) {
+	{ m(0, 0) } -> std::convertible_to<Scalar>;
+	{ m.rows() } -> std::integral;
+	{ m.cols() } -> std::integral;
+	// more stuff but CBA
+};
+template<typename T>
+concept Vectoroid = requires(T const& v) {
+	{ v[0] } -> std::convertible_to<Scalar>;
+	// more stuff but CBA
+};
+
+namespace Detail {
+	template<typename T>
+	struct IsEigenSequence : std::false_type {};
+	template<typename FirstType, typename SizeType, typename IncrType>
+	struct IsEigenSequence<Eigen::ArithmeticSequence<FirstType, SizeType, IncrType>> : std::true_type {};
+}
+template<typename T>
+concept Sequenceoid = Detail::IsEigenSequence<T>::value;
+
+template<typename T>
+[[nodiscard]] constexpr T sq(T const& x) {
+	return x * x;
+}
+[[nodiscard]] constexpr auto absi(std::signed_integral auto x) noexcept {
+	assert(x != std::numeric_limits<decltype(x)>::min());
+	return x < 0 ? -x : x;
+}
+[[nodiscard]] constexpr auto absi(std::unsigned_integral auto x) noexcept {
+	return x;
+}
+[[nodiscard]] constexpr unsigned exp2i(unsigned x) noexcept {
+	assert(x < std::numeric_limits<unsigned>::digits);
+	return 1<<x;
+}
+inline constexpr Scalar pi = std::numbers::pi_v<Scalar>;
+
+template<std::invocable Invocable>
+struct ScopeGuard {
+	Invocable invocable;
+	[[nodiscard]] explicit constexpr ScopeGuard(Invocable&& invocable) noexcept(std::is_nothrow_move_constructible_v<Invocable>) : invocable(std::move(invocable)) {}
+	[[nodiscard]] explicit constexpr ScopeGuard(Invocable const& invocable) noexcept(std::is_nothrow_copy_constructible_v<Invocable>) : invocable(invocable) {}
+	constexpr ~ScopeGuard() noexcept(std::is_nothrow_invocable_v<Invocable>) { invocable(); }
+};
+
+template<Tensoroid Tensor, unsigned ranks, unsigned slicedRank, std::integral IndexType = int>
+struct Rank1Slice {
+	Tensor* tensor;
+	std::array<IndexType, ranks> position;
+
+	[[nodiscard]] constexpr Rank1Slice(Tensor& tensor, std::convertible_to<IndexType> auto const&... indices)
+	: tensor(std::addressof(tensor)), position{ indices... } {
+		static_assert(sizeof...(indices) == ranks);
+	}
+
+	// not multi-thread friendly
+	[[nodiscard]] constexpr decltype(auto) operator[](IndexType i) const {
+		assert(tensor);
+		const_cast<Rank1Slice*>(this)->position[slicedRank] += i;
+		const ScopeGuard restore([this, i]{ const_cast<Rank1Slice*>(this)->position[slicedRank] -= i; });
+		return std::apply(*tensor, position);
+	}
+};
+template<unsigned slicedRank>
+[[nodiscard]] constexpr auto rank1Slice(Tensoroid auto& tensor, std::integral auto const&... indices) {
+	return Rank1Slice<std::remove_reference_t<decltype(tensor)>, sizeof...(indices), slicedRank, std::common_type_t<std::remove_cvref_t<decltype(indices)>...>>(tensor, indices...);
+}
+
+template<template<typename, typename> typename F, typename I, typename... Args>
+struct FoldR;
+template<template<typename, typename> typename F, typename I, typename T>
+struct FoldR<F, I, T> : std::type_identity<typename F<T, I>::type> {};
+template<template<typename, typename> typename F, typename I, typename T, typename U, typename... Tail>
+struct FoldR<F, I, T, U, Tail...> : std::type_identity<typename F<T, typename FoldR<F, I, U, Tail...>::type>::type> {};
+
+template<std::integral auto index_, TemplateParamScalar factor_>
+struct IndexCoeffPair {
+	static constexpr auto index = index_;
+	static constexpr auto factor = factor_;
+};
+template<typename T>
+struct IsIndexCoeffPair : std::false_type {};
+template<std::integral auto index, TemplateParamScalar factor>
+struct IsIndexCoeffPair<IndexCoeffPair<index, factor>> : std::true_type {};
+template<typename T>
+concept IndexCoeff = IsIndexCoeffPair<T>::value;
+
+template<typename Lhs, typename Rhs>
+struct IndexMin : std::type_identity<std::conditional_t<(Lhs::index < Rhs::index), Lhs, Rhs>> {};
+template<typename Lhs, typename Rhs>
+struct IndexMax : std::type_identity<std::conditional_t<(Lhs::index > Rhs::index), Lhs, Rhs>> {};
+
+enum class StencilType { asymmetric, symmetric, antisymmetric };
+template<StencilType type, IndexCoeff... Pairs>
+struct Stencil /* 1D only */ {
+	static constexpr auto min_index = FoldR<IndexMin, IndexCoeffPair<std::numeric_limits<int>::max(), {}>, Pairs...>::type::index;
+	static constexpr auto max_index = FoldR<IndexMax, IndexCoeffPair<std::numeric_limits<int>::min(), {}>, Pairs...>::type::index;
+
+	static constexpr auto left   = type == StencilType::asymmetric ? min_index : -std::max(absi(min_index), max_index);
+	static constexpr auto right  = type == StencilType::asymmetric ? max_index :  std::max(absi(min_index), max_index);
+	static constexpr auto margin = sizeof...(Pairs) > 0 ? std::max(absi(left), right) : 0;
+
+	[[nodiscard]] static constexpr auto apply(Vectoroid auto&& vector) {
+		switch(type) {
+		case StencilType::asymmetric:
+			return ((Pairs::factor * vector[Pairs::index]) + ...);
+		case StencilType::symmetric:
+			return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] + vector[-Pairs::index] : vector[Pairs::index])) + ...);
+		case StencilType::antisymmetric:
+			return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] - vector[-Pairs::index] : vector[Pairs::index])) + ...);
+		};
+	}
+};
+
+class VectorField2 {
+	Matrix uv[2];
+
+public:
+	[[nodiscard]] VectorField2() = default;
+	[[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols)
+	: uv{ Matrix(rows, cols), Matrix(rows, cols) } {
+	}
+	[[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols, Scalar fill)
+	: uv{ Matrix::Constant(rows, cols, fill), Matrix::Constant(rows, cols, fill) } {
+	}
+
+	[[nodiscard]] constexpr Matrix const& operator[](int i) const noexcept { assert(0 <= i && i < 2); return uv[i]; }
+	[[nodiscard]] constexpr Matrix      & operator[](int i)       noexcept { assert(0 <= i && i < 2); return uv[i]; }
+
+	[[nodiscard]] constexpr Eigen::Index rows() const noexcept { return uv[0].rows(); }
+	[[nodiscard]] constexpr Eigen::Index cols() const noexcept { return uv[0].cols(); }
+};
+
+using AdvectionStencil1 = Stencil<StencilType::antisymmetric,
+	IndexCoeffPair<1, TemplateParamScalar( 45)>,
+	IndexCoeffPair<2, TemplateParamScalar(- 9)>,
+	IndexCoeffPair<3, TemplateParamScalar(  1)>>;
+using AdvectionStencil2 = Stencil<StencilType::symmetric,
+	IndexCoeffPair<0, TemplateParamScalar(-20)>,
+	IndexCoeffPair<1, TemplateParamScalar( 15)>,
+	IndexCoeffPair<2, TemplateParamScalar(- 6)>,
+	IndexCoeffPair<3, TemplateParamScalar(  1)>>;
+
+using DiffusionStencil = Stencil<StencilType::symmetric,
+	IndexCoeffPair<0, TemplateParamScalar(-30)>,
+	IndexCoeffPair<1, TemplateParamScalar( 16)>,
+	IndexCoeffPair<2, TemplateParamScalar(- 1)>>;
+
+constexpr int margin = std::max({ AdvectionStencil1::margin, AdvectionStencil2::margin, DiffusionStencil::margin });
+
+[[nodiscard]] constexpr Scalar advection(Scalar d, Scalar w, Vectoroid auto const& phiSlice) {
+	return
+		(           w  * AdvectionStencil1::apply(phiSlice)
+		 - std::abs(w) * AdvectionStencil2::apply(phiSlice)) / (60 * d);
+}
+template<std::integral auto advectRank>
+void advection(Vectoroid auto const& d, Matrixoid auto const& w, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable<Scalar&, Scalar> auto assignment) {
+	const int nx = w.cols(), ny = w.rows();
+	assert(phi.cols() == nx && phi.rows() == ny);
+	assert(out.cols() == nx && out.rows() == ny);
+	#pragma omp parallel for schedule(static)
+	// #pragma omp parallel for schedule(dynamic)
+	// #pragma omp parallel for schedule(static) collapse(2)
+	// #pragma omp parallel for schedule(dynamic) collapse(2)
+	for(int x = margin; x < nx - margin; ++x)
+		for(int y = margin; y < ny - margin; ++y)
+			assignment(out(y, x), advection(d[advectRank], w(y, x), rank1Slice<advectRank>(phi, y, x)));
+}
+void advection(Vectoroid auto const& d, VectorField2 const& uv, VectorField2& adv) {
+	advection<0>(d, uv[0], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+	advection<1>(d, uv[1], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+
+	advection<0>(d, uv[0], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+	advection<1>(d, uv[1], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+}
+
+[[nodiscard]] constexpr Scalar diffusion(Scalar d, Vectoroid auto const& phiSlice) {
+	return 1 / (12 * sq(d)) * DiffusionStencil::apply(phiSlice);
+}
+template<std::integral auto diffuseRank>
+void diffusion(Vectoroid auto const& d, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable<Scalar&, Scalar> auto assignment) {
+	const int nx = phi.cols(), ny = phi.rows();
+	assert(out.cols() == nx && out.rows() == ny);
+	#pragma omp parallel for schedule(static)
+	// #pragma omp parallel for schedule(dynamic)
+	// #pragma omp parallel for schedule(static) collapse(2)
+	// #pragma omp parallel for schedule(dynamic) collapse(2)
+	for(int x = margin; x < nx - margin; ++x)
+		for(int y = margin; y < ny - margin; ++y)
+			assignment(out(y, x), diffusion(d[diffuseRank], rank1Slice<diffuseRank>(phi, y, x)));
+}
+void diffusion(Vectoroid auto const& d, VectorField2 const& uv, VectorField2& diff) {
+	diffusion<0>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+	diffusion<1>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+
+	diffusion<0>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+	diffusion<1>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+}
+
+void rk_stage(
+	Scalar mu,
+	VectorField2 const& in_now,
+	VectorField2 const& in_tmp,
+	VectorField2& out,
+	VectorField2& adv,
+	VectorField2& diff,
+	Scalar dt,
+	Vectoroid auto const& d
+) {
+	advection(d, in_tmp, adv);
+	diffusion(d, in_tmp, diff);
+	out[0] = in_now[0] + dt * (-adv[0] + mu * diff[0]);
+	out[1] = in_now[1] + dt * (-adv[1] + mu * diff[1]);
+}
+
+enum UseCase { zhao, hopf_cole };
+
+[[nodiscard]] constexpr VectorField2 solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, Sequenceoid auto slice_x, Sequenceoid auto slice_y) {
+	const int mi = slice_x.size() ? slice_x[slice_x.size() - 1] - slice_x[0] + 1 : 0;
+	const int mj = slice_y.size() ? slice_y[slice_y.size() - 1] - slice_y[0] + 1 : 0;
+
+	const Matrixoid auto x2d = x(slice_x).replicate(1, mj);
+	const Matrixoid auto y2d = y(slice_y).transpose().replicate(mi, 1);
+
+	const int nx = x2d.cols(), ny = x2d.rows();
+	assert(y2d.cols() == nx && y2d.rows() == ny);
+
+	using std::exp;
+	VectorField2 result;
+	switch(useCase) {
+	case zhao:
+		result[0] = -4 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().cos() * (pi * y2d).array().sin() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin());
+		result[1] = -2 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().cos() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin());
+		break;
+	case hopf_cole:
+		result[0] = 0.75 - 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu)));
+		result[1] = 0.75 + 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu)));
+		break;
+	default:
+		assert(false);
+	}
+	return result;
+}
+[[nodiscard]] constexpr decltype(auto) solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y) {
+	return solution_factory(useCase, mu, t, x, y, Eigen::seq(0, x.size() - 1), Eigen::seq(0, y.size() - 1));
+}
+
+[[nodiscard]] constexpr VectorField2 initial_solution(UseCase useCase, Scalar mu, Matrixoid auto const& x, Matrixoid auto const& y) {
+	return solution_factory(useCase, mu, 0, x, y);
+}
+
+void enforce_boundary_conditions(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, VectorField2& uv) {
+	const int nx = x.size(), ny = y.size();
+
+	{
+		auto slice_x = Eigen::seq(0, margin - 1);
+		auto slice_y = Eigen::seq(0, ny - 1);
+		VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y);
+		uv[0](slice_x, slice_y) = std::move(uv_ex[0]);
+		uv[1](slice_x, slice_y) = std::move(uv_ex[1]);
+	}
+	{
+		auto slice_x = Eigen::seq(nx - margin, nx - 1);
+		auto slice_y = Eigen::seq(0, ny - 1);
+		VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y);
+		uv[0](slice_x, slice_y) = std::move(uv_ex[0]);
+		uv[1](slice_x, slice_y) = std::move(uv_ex[1]);
+	}
+	{
+		auto slice_x = Eigen::seq(margin, nx - margin - 1);
+		auto slice_y = Eigen::seq(0, margin - 1);
+		VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y);
+		uv[0](slice_x, slice_y) = std::move(uv_ex[0]);
+		uv[1](slice_x, slice_y) = std::move(uv_ex[1]);
+	}
+	{
+		auto slice_x = Eigen::seq(margin, nx - margin - 1);
+		auto slice_y = Eigen::seq(ny - margin, ny - 1);
+		VectorField2 uv_ex = solution_factory(useCase, mu, t, x, y, slice_x, slice_y);
+		uv[0](slice_x, slice_y) = std::move(uv_ex[0]);
+		uv[1](slice_x, slice_y) = std::move(uv_ex[1]);
+	}
+}
+
+// TODO: only a makeshift because my stdlib doesn't implement this
+template<typename CharT, typename Traits, typename Rep, typename Period>
+std::basic_ostream<CharT, Traits>& operator<<(std::basic_ostream<CharT, Traits>& os, std::chrono::duration<Rep, Period> const& d) {
+	std::basic_ostringstream<CharT, Traits> s;
+	s.flags(os.flags());
+	s.imbue(os.getloc());
+	s.precision(os.precision());
+	s << d.count();
+	     if constexpr(std::is_same_v<typename Period::type, std::nano    >) s << "ns";
+	else if constexpr(std::is_same_v<typename Period::type, std::micro   >) s << "µs";
+	else if constexpr(std::is_same_v<typename Period::type, std::milli   >) s << "ms";
+	else if constexpr(std::is_same_v<typename Period::type, std::ratio<1>>) s << "s";
+	else {
+		s << '[' << Period::type::num;
+		if constexpr(Period::type::den != 1)
+			s << '/' << Period::type::den;
+		s << ']';
+	}
+	return os << s.str();
+}
+
+template<typename T>
+struct WelfordVariance {
+	std::size_t n = 0;
+	T mean{};
+	T m{};
+
+	constexpr void update(T const& x) {
+		++n;
+		const T old_mean = mean;
+		mean = ((n - 1) * old_mean + x) / n;
+		m += (x - old_mean) * (x - mean);
+	}
+
+	[[nodiscard]] constexpr T variance() { return m / n; }
+	[[nodiscard]] constexpr T sample_variance() { return m / (n - 1); }
+};
+
+struct BenchmarkReport {
+	using Duration = std::chrono::duration<double, std::milli>;
+
+	Duration best;
+	Duration average;
+	Duration stddev;
+	unsigned runs;
+};
+std::ostream& operator<<(std::ostream& stream, BenchmarkReport const& report) {
+	stream << "runtime: " << report.average << " ± " << report.stddev << "; best: " << report.best << "; n=" << report.runs << '\n';
+	return stream;
+}
+
+template<std::invocable Invocable, typename Clock>
+[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak, Clock clock) {
+	assert(streak > 0);
+	using std::sqrt;
+	BenchmarkReport result;
+	WelfordVariance<BenchmarkReport::Duration::rep> variance;
+	result.best = BenchmarkReport::Duration(std::numeric_limits<double>::max());
+	for(unsigned current_streak = 0; current_streak < streak; ) {
+		const auto start = clock.now();
+		invocable();
+		const auto end = clock.now();
+		const auto duration = std::chrono::duration_cast<BenchmarkReport::Duration>(end - start);
+		if(duration < result.best) {
+			result.best = duration;
+			current_streak = 0;
+		} else {
+			++current_streak;
+		}
+		variance.update(duration.count());
+	}
+	result.average = BenchmarkReport::Duration(variance.mean);
+	result.stddev = BenchmarkReport::Duration(sqrt(variance.sample_variance()));
+	result.runs = variance.n;
+	return result;
+}
+template<std::invocable Invocable>
+[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak = 100) {
+	if constexpr(std::chrono::high_resolution_clock::is_steady)
+		return benchmark(std::forward<Invocable>(invocable), streak, std::chrono::high_resolution_clock{});
+	else
+		return benchmark(std::forward<Invocable>(invocable), streak, std::chrono::steady_clock{});
+}
+
+int main(int argc, char** argv)
+try {
+	unsigned int threads = 1;
+	unsigned int sideLength = 21;
+	unsigned int iterationsOption = -1;
+	unsigned int streak = 100;
+
+	po::parser parser;
+	auto& help = parser["help"]
+		.abbreviation('?')
+		.description("print this help screen");
+	parser["threads"]
+		.abbreviation('n')
+		.description("set the number of threads (default: 1)")
+		.bind(threads);
+	parser["sideLength"]
+		.abbreviation('l')
+		.description("set the grid side length (default: 21)")
+		.bind(sideLength);
+	parser["iterations"]
+		.abbreviation('i')
+		.description("set the grid side length (default: (sideLength-1)^2)")
+		.bind(iterationsOption);
+	parser["streak"]
+		.abbreviation('s')
+		.description("set the length of a streak required to escape the benchmarking algorithm (default: 100)")
+		.bind(streak);
+
+	if(!parser(argc, argv))
+		return EXIT_FAILURE;
+
+	if(help.was_set()) {
+		std::cout << parser << '\n';
+		return EXIT_SUCCESS;
+	}
+
+	if(threads < 1)
+		throw std::range_error("at least one thread required");
+	omp_set_num_threads(threads);
+
+	if(sideLength < 1)
+		throw std::range_error("sideLength shall be >= 1");
+
+	if(streak < 1)
+		throw std::range_error("streak shall be >= 1");
+
+	// use case
+	constexpr UseCase useCase = zhao;
+
+	// diffusion coefficient
+	const Scalar mu = 0.1;
+
+	// time
+	const Scalar cfl = 1;
+	const Scalar timestep = cfl / sq(sideLength - 1);
+	const int n_iter = iterationsOption != -1 ? iterationsOption : sq(sideLength - 1);
+
+	// output
+	const int print_period = 0;
+
+	// grid
+	const int nx = sideLength;
+	const Vector x = Vector::LinSpaced(nx, 0, 1);
+	const Scalar dx = 1.0 / (nx - 1);
+	const int ny = sideLength;
+	const Vector y = Vector::LinSpaced(ny, 0, 1);
+	const Scalar dy = 1.0 / (ny - 1);
+
+	// vector fields
+	VectorField2 uv_now(ny, nx);
+	VectorField2 uv_new(ny, nx);
+	VectorField2 adv(ny, nx, 0);
+	VectorField2 diff(ny, nx, 0);
+
+	const auto rk_fractions = { 1.0 / 3, 1.0 / 2, 1.0 };
+
+	const auto task = [&]{
+		uv_now = VectorField2(ny, nx, 0);
+		uv_new = initial_solution(useCase, mu, x, y);
+
+		Scalar t = 0;
+		for(int i = 0; i < n_iter; ++i) {
+			uv_now = uv_new;
+
+			for(const double rk_fraction : rk_fractions) {
+				const Scalar dt = rk_fraction * timestep;
+				rk_stage(mu, uv_now, uv_new, uv_new, adv, diff, dt, Vector2{ dx, dy });
+				enforce_boundary_conditions(useCase, mu, t + dt, x, y, uv_new);
+			}
+
+			t += timestep;
+			if(print_period > 0 && ((i+1) % print_period == 0 || i+1 == n_iter)) {
+				VectorField2 uv_ex  = solution_factory(useCase, mu, t, x, y);
+				const Scalar err_u = (uv_new[0].block(margin, margin, ny - 2 * margin, nx - 2 * margin) - uv_ex[0].block(margin, margin, ny - 2 * margin, nx - 2 * margin)).norm() * std::sqrt(dx * dy);
+				const Scalar err_v = (uv_new[1].block(margin, margin, ny - 2 * margin, nx - 2 * margin) - uv_ex[1].block(margin, margin, ny - 2 * margin, nx - 2 * margin)).norm() * std::sqrt(dx * dy);
+				std::cout << "Iteration " << std::right << std::setfill(' ') << std::setw(6) << i + 1;
+				std::cout << std::scientific << std::setprecision(4);
+				std::cout << ": ||u - uex|| = " << err_u << " m/s, ||v - vex|| = " << err_v << " m/s\n";
+				std::cout << std::defaultfloat;
+			}
+		}
+	};
+	std::cout << threads << "-threaded benchmark:" << benchmark(task, streak);
+} catch(std::exception const& exception) {
+	std::cerr << po::red << "exception";
+	std::cerr << ": " << exception.what() << '\n';
+	return EXIT_FAILURE;
+}
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/plot b/projects/2021/groupK_burgers_openmp/branch-omp/plot
new file mode 100644
index 00000000..6e13ac7d
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/plot
@@ -0,0 +1,22 @@
+# just to get GPVAL_Y_MAX
+p "results-shallow-static.txt" using 1:2 \
+, "results-shallow-dynamic.txt" using 1:2 \
+, "results-collapsed-static.txt" using 1:2 \
+#, "results-collapsed-dynamic.txt" using 1:2
+
+set term svg
+set output "plot.svg"
+
+set xlabel "Threads"
+set ylabel "Runtime [ms]"
+
+set xrange [1:24]
+set yrange [0:GPVAL_Y_MAX]
+
+set arrow from 12, graph 0 to 12, graph 1 nohead lw 0.5 lc rgb "0xC0C0C0"
+
+p 1761.55 title "single-threaded" lt black \
+, "results-shallow-static.txt" using 1:2 w l title "shallow, static" \
+, "results-shallow-dynamic.txt" using 1:2 w l title "shallow, dynamic" \
+, "results-collapsed-static.txt" using 1:2 w l title "collapsed, static" \
+#, "results-collapsed-dynamic.txt" using 1:2 w l title "collapsed, dynamic" \
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/plot.svg b/projects/2021/groupK_burgers_openmp/branch-omp/plot.svg
new file mode 100644
index 00000000..887be669
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/plot.svg
@@ -0,0 +1,198 @@
+<?xml version="1.0" encoding="utf-8"  standalone="no"?>
+<svg 
+ width="600" height="480"
+ viewBox="0 0 600 480"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:xlink="http://www.w3.org/1999/xlink"
+>
+
+<title>Gnuplot</title>
+<desc>Produced by GNUPLOT 5.4 patchlevel 2 </desc>
+
+<g id="gnuplot_canvas">
+
+<rect x="0" y="0" width="600" height="480" fill="none"/>
+<defs>
+
+	<circle id='gpDot' r='0.5' stroke-width='0.5' stroke='currentColor'/>
+	<path id='gpPt0' stroke-width='0.222' stroke='currentColor' d='M-1,0 h2 M0,-1 v2'/>
+	<path id='gpPt1' stroke-width='0.222' stroke='currentColor' d='M-1,-1 L1,1 M1,-1 L-1,1'/>
+	<path id='gpPt2' stroke-width='0.222' stroke='currentColor' d='M-1,0 L1,0 M0,-1 L0,1 M-1,-1 L1,1 M-1,1 L1,-1'/>
+	<rect id='gpPt3' stroke-width='0.222' stroke='currentColor' x='-1' y='-1' width='2' height='2'/>
+	<rect id='gpPt4' stroke-width='0.222' stroke='currentColor' fill='currentColor' x='-1' y='-1' width='2' height='2'/>
+	<circle id='gpPt5' stroke-width='0.222' stroke='currentColor' cx='0' cy='0' r='1'/>
+	<use xlink:href='#gpPt5' id='gpPt6' fill='currentColor' stroke='none'/>
+	<path id='gpPt7' stroke-width='0.222' stroke='currentColor' d='M0,-1.33 L-1.33,0.67 L1.33,0.67 z'/>
+	<use xlink:href='#gpPt7' id='gpPt8' fill='currentColor' stroke='none'/>
+	<use xlink:href='#gpPt7' id='gpPt9' stroke='currentColor' transform='rotate(180)'/>
+	<use xlink:href='#gpPt9' id='gpPt10' fill='currentColor' stroke='none'/>
+	<use xlink:href='#gpPt3' id='gpPt11' stroke='currentColor' transform='rotate(45)'/>
+	<use xlink:href='#gpPt11' id='gpPt12' fill='currentColor' stroke='none'/>
+	<path id='gpPt13' stroke-width='0.222' stroke='currentColor' d='M0,1.330 L1.265,0.411 L0.782,-1.067 L-0.782,-1.076 L-1.265,0.411 z'/>
+	<use xlink:href='#gpPt13' id='gpPt14' fill='currentColor' stroke='none'/>
+	<filter id='textbox' filterUnits='objectBoundingBox' x='0' y='0' height='1' width='1'>
+	  <feFlood flood-color='white' flood-opacity='1' result='bgnd'/>
+	  <feComposite in='SourceGraphic' in2='bgnd' operator='atop'/>
+	</filter>
+	<filter id='greybox' filterUnits='objectBoundingBox' x='0' y='0' height='1' width='1'>
+	  <feFlood flood-color='lightgrey' flood-opacity='1' result='grey'/>
+	  <feComposite in='SourceGraphic' in2='grey' operator='atop'/>
+	</filter>
+</defs>
+<g fill="none" color="white" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,422.40 L89.92,422.40 M574.82,422.40 L565.82,422.40  '/>	<g transform="translate(72.53,426.30)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" > 0</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,371.85 L89.92,371.85 M574.82,371.85 L565.82,371.85  '/>	<g transform="translate(72.53,375.75)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" > 500</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,321.30 L89.92,321.30 M574.82,321.30 L565.82,321.30  '/>	<g transform="translate(72.53,325.20)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" > 1000</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,270.75 L89.92,270.75 M574.82,270.75 L565.82,270.75  '/>	<g transform="translate(72.53,274.65)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" > 1500</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,220.20 L89.92,220.20 M574.82,220.20 L565.82,220.20  '/>	<g transform="translate(72.53,224.10)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" > 2000</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,169.66 L89.92,169.66 M574.82,169.66 L565.82,169.66  '/>	<g transform="translate(72.53,173.56)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" > 2500</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,119.11 L89.92,119.11 M574.82,119.11 L565.82,119.11  '/>	<g transform="translate(72.53,123.01)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" > 3000</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,68.56 L89.92,68.56 M574.82,68.56 L565.82,68.56  '/>	<g transform="translate(72.53,72.46)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" > 3500</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,18.01 L89.92,18.01 M574.82,18.01 L565.82,18.01  '/>	<g transform="translate(72.53,21.91)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" > 4000</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M166.82,422.40 L166.82,413.40 M166.82,18.01 L166.82,27.01  '/>	<g transform="translate(166.82,444.30)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="middle">
+		<text><tspan font-family="Arial" > 5</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M274.19,422.40 L274.19,413.40 M274.19,18.01 L274.19,27.01  '/>	<g transform="translate(274.19,444.30)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="middle">
+		<text><tspan font-family="Arial" > 10</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M381.55,422.40 L381.55,413.40 M381.55,18.01 L381.55,27.01  '/>	<g transform="translate(381.55,444.30)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="middle">
+		<text><tspan font-family="Arial" > 15</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M488.92,422.40 L488.92,413.40 M488.92,18.01 L488.92,27.01  '/>	<g transform="translate(488.92,444.30)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="middle">
+		<text><tspan font-family="Arial" > 20</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,18.01 L80.92,422.40 L574.82,422.40 L574.82,18.01 L80.92,18.01 Z  '/></g>
+<g fill="none" color="black" stroke="black" stroke-width="0.50" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='rgb(192, 192, 192)'  d='M317.14,422.39 L317.14,18.00  '/></g>
+<g fill="none" color="black" stroke="rgb(192, 192, 192)" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<g transform="translate(19.18,220.21) rotate(270)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="middle">
+		<text><tspan font-family="Arial" >Runtime [ms]</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<g transform="translate(327.87,471.30)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="middle">
+		<text><tspan font-family="Arial" >Threads</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+</g>
+	<g id="gnuplot_plot_1" ><title>single-threaded</title>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<g transform="translate(507.09,39.91)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" >single-threaded</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M515.48,36.01 L558.04,36.01 M80.92,244.31 L85.91,244.31 L90.90,244.31 L95.89,244.31 L100.88,244.31 L105.86,244.31
+		L110.85,244.31 L115.84,244.31 L120.83,244.31 L125.82,244.31 L130.81,244.31 L135.80,244.31 L140.79,244.31 L145.78,244.31
+		L150.76,244.31 L155.75,244.31 L160.74,244.31 L165.73,244.31 L170.72,244.31 L175.71,244.31 L180.70,244.31 L185.69,244.31
+		L190.68,244.31 L195.66,244.31 L200.65,244.31 L205.64,244.31 L210.63,244.31 L215.62,244.31 L220.61,244.31 L225.60,244.31
+		L230.59,244.31 L235.58,244.31 L240.56,244.31 L245.55,244.31 L250.54,244.31 L255.53,244.31 L260.52,244.31 L265.51,244.31
+		L270.50,244.31 L275.49,244.31 L280.48,244.31 L285.46,244.31 L290.45,244.31 L295.44,244.31 L300.43,244.31 L305.42,244.31
+		L310.41,244.31 L315.40,244.31 L320.39,244.31 L325.38,244.31 L330.36,244.31 L335.35,244.31 L340.34,244.31 L345.33,244.31
+		L350.32,244.31 L355.31,244.31 L360.30,244.31 L365.29,244.31 L370.28,244.31 L375.26,244.31 L380.25,244.31 L385.24,244.31
+		L390.23,244.31 L395.22,244.31 L400.21,244.31 L405.20,244.31 L410.19,244.31 L415.18,244.31 L420.16,244.31 L425.15,244.31
+		L430.14,244.31 L435.13,244.31 L440.12,244.31 L445.11,244.31 L450.10,244.31 L455.09,244.31 L460.08,244.31 L465.06,244.31
+		L470.05,244.31 L475.04,244.31 L480.03,244.31 L485.02,244.31 L490.01,244.31 L495.00,244.31 L499.99,244.31 L504.98,244.31
+		L509.96,244.31 L514.95,244.31 L519.94,244.31 L524.93,244.31 L529.92,244.31 L534.91,244.31 L539.90,244.31 L544.89,244.31
+		L549.88,244.31 L554.86,244.31 L559.85,244.31 L564.84,244.31 L569.83,244.31 L574.82,244.31  '/></g>
+	</g>
+	<g id="gnuplot_plot_2" ><title>shallow, static</title>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<g transform="translate(507.09,57.91)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" >shallow, static</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='rgb(  0, 158, 115)'  d='M515.48,54.01 L558.04,54.01 M80.92,238.04 L102.39,280.70 L123.87,301.11 L145.34,307.00 L166.82,308.87 L188.29,319.73
+		L209.76,319.63 L231.24,317.82 L252.71,318.43 L274.19,319.10 L295.66,318.31 L317.13,316.33 L338.61,306.27 L360.08,298.40
+		L381.55,310.76 L403.03,282.76 L424.50,296.35 L445.98,295.19 L467.45,284.61 L488.92,299.58 L510.40,286.96 L531.87,283.44
+		L553.35,268.02 L574.82,225.89  '/></g>
+	</g>
+	<g id="gnuplot_plot_3" ><title>shallow, dynamic</title>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<g transform="translate(507.09,75.91)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" >shallow, dynamic</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='rgb( 86, 180, 233)'  d='M515.48,72.01 L558.04,72.01 M80.92,238.33 L102.39,240.34 L123.87,255.30 L145.34,249.62 L166.82,270.13 L188.29,272.24
+		L209.76,263.35 L231.24,268.38 L252.71,271.09 L274.19,270.84 L295.66,275.12 L317.13,272.70 L338.61,258.06 L360.08,264.09
+		L381.55,262.35 L403.03,262.25 L424.50,266.66 L445.98,264.17 L467.45,271.18 L488.92,253.84 L510.40,239.64 L531.87,239.58
+		L553.35,241.09 L574.82,205.46  '/></g>
+	</g>
+	<g id="gnuplot_plot_4" ><title>collapsed, static</title>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<g transform="translate(507.09,93.91)" stroke="none" fill="black" font-family="Arial" font-size="12.00"  text-anchor="end">
+		<text><tspan font-family="Arial" >collapsed, static</tspan></text>
+	</g>
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='rgb(230, 159,   0)'  d='M515.48,90.01 L558.04,90.01 M80.92,19.92 L102.39,177.11 L123.87,227.99 L145.34,256.95 L166.82,268.86 L188.29,279.59
+		L209.76,288.55 L231.24,296.05 L252.71,291.62 L274.19,297.88 L295.66,298.77 L317.13,299.69 L338.61,284.45 L360.08,266.00
+		L381.55,287.56 L403.03,268.35 L424.50,274.12 L445.98,279.34 L467.45,274.48 L488.92,281.54 L510.40,274.67 L531.87,278.10
+		L553.35,262.34 L574.82,193.10  '/></g>
+	</g>
+<g fill="none" color="white" stroke="rgb(230, 159,   0)" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="2.00" stroke-linecap="butt" stroke-linejoin="miter">
+</g>
+<g fill="none" color="black" stroke="black" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+</g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+	<path stroke='black'  d='M80.92,18.01 L80.92,422.40 L574.82,422.40 L574.82,18.01 L80.92,18.01 Z  '/></g>
+<g fill="none" color="black" stroke="currentColor" stroke-width="1.00" stroke-linecap="butt" stroke-linejoin="miter">
+</g>
+</g>
+</svg>
+
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-dynamic.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-dynamic.txt
new file mode 100644
index 00000000..5bf5552f
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-dynamic.txt
@@ -0,0 +1,24 @@
+1 12935.7 360.757 12680.6 2
+2 111735 161.192 111621 2
+3 121632 270.072 121441 2
+4 124374 156.605 124180 5
+5 102743 17.7151 102730 2
+6 88622.3 136.157 88526 2
+7 78737.6 232.754 78573 2
+8 70705 135.711 70553.9 4
+9 66842.6 88.6509 66779.9 2
+10 64278.5 154.226 64169.4 2
+11 62228.7 183.461 62099 2
+12 60467.4 59.3568 60425.4 2
+13 58008.2 10.2265 58001 2
+14 54951.5 29.5089 54918.2 4
+15 52639.1 34.7557 52614.5 2
+16 50617.5 129.173 50526.2 2
+17 48262.1 6.0242 48257.9 2
+18 46833.2 28.7177 46812.9 2
+19 45230 12.5256 45221.1 2
+20 43992.6 0.927018 43991.9 2
+21 42174.7 7.70656 42169.2 2
+22 40838.7 21.838 40823.3 2
+23 39799.1 53.1698 39733.3 5
+24 42077 286.367 41874.5 2
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-static.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-static.txt
new file mode 100644
index 00000000..9bc040ab
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-collapsed-static.txt
@@ -0,0 +1,24 @@
+1 3981.12 56.9674 3914.87 38
+2 2426.23 36.3124 2363.07 23
+3 1922.95 58.7132 1856.75 24
+4 1636.56 64.1493 1571.75 51
+5 1518.76 50.4952 1421.02 21
+6 1412.58 70.5845 1275.9 39
+7 1323.95 42.6127 1248.95 42
+8 1249.74 28.4438 1206.39 60
+9 1293.59 38.2212 1208.93 48
+10 1231.64 35.2432 1160.04 30
+11 1222.88 30.9767 1162.71 39
+12 1213.74 25.8987 1175.75 30
+13 1364.55 17.689 1320.49 30
+14 1546.98 8.85813 1535.59 33
+15 1333.81 113.325 1224.58 47
+16 1523.74 27.3059 1460.9 24
+17 1466.7 2.91745 1460.98 33
+18 1415.08 68.6933 1248.24 30
+19 1463.17 15.9095 1447.6 27
+20 1393.29 84.6648 1290.35 50
+21 1461.23 18.6694 1433.07 23
+22 1427.35 27.7095 1398.9 23
+23 1583.25 55.8459 1523.57 24
+24 2268.06 137.569 2103.68 22
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-dynamic.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-dynamic.txt
new file mode 100644
index 00000000..a1a65ce1
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-dynamic.txt
@@ -0,0 +1,24 @@
+1 1820.68 28.8543 1788.09 42
+2 1800.82 45.1005 1719.79 30
+3 1652.87 24.5901 1625.1 38
+4 1709.06 16.931 1677.26 29
+5 1506.15 22.8942 1472.38 50
+6 1485.3 26.5845 1457.99 33
+7 1573.28 61.0053 1468.64 42
+8 1523.51 28.4464 1471.67 21
+9 1496.66 65.6791 1397.31 36
+10 1499.12 49.2789 1399.24 26
+11 1456.79 36.713 1402.69 59
+12 1480.78 26.3377 1452.96 35
+13 1625.52 192.755 1387.4 99
+14 1565.94 176.501 1395.46 63
+15 1583.1 168.616 1386.43 60
+16 1584.13 181.155 1410.02 72
+17 1540.47 166.82 1407.7 66
+18 1565.17 161.84 1422.28 32
+19 1495.8 185.809 1364.82 62
+20 1667.31 184.946 1396.48 23
+21 1807.79 10.7196 1782.79 21
+22 1808.32 10.3299 1795.53 28
+23 1793.4 18.6433 1767.01 30
+24 2145.88 70.3782 2041.5 33
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-static.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-static.txt
new file mode 100644
index 00000000..c56251df
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-shallow-static.txt
@@ -0,0 +1,24 @@
+1 1823.55 26.2263 1782.73 23
+2 1401.58 19.8054 1349.46 96
+3 1199.78 20.8571 1167.98 30
+4 1141.46 19.449 1111.81 24
+5 1122.95 22.5619 1092.07 38
+6 1015.55 47.5705 972.33 47
+7 1016.55 13.0833 995.12 24
+8 1034.47 26.4825 953.92 32
+9 1028.39 73.6957 949.22 66
+10 1021.82 30.7757 985.06 42
+11 1029.58 23.5031 996.72 54
+12 1049.21 28.4955 1012.32 27
+13 1148.73 32.048 1099.74 33
+14 1226.55 116.99 1060.86 42
+15 1104.32 64.5803 1048.97 48
+16 1381.24 11.6949 1363.26 36
+17 1246.86 125.312 1062.78 57
+18 1258.27 123.22 1072.89 39
+19 1362.96 19.4314 1333.75 33
+20 1214.86 133.691 1055.18 44
+21 1339.66 28.7203 1311.06 48
+22 1374.55 57.2419 1347.98 33
+23 1527.07 139.481 1365.03 37
+24 1943.73 351.194 1581.44 36
diff --git a/projects/2021/groupK_burgers_openmp/branch-omp/results-single.txt b/projects/2021/groupK_burgers_openmp/branch-omp/results-single.txt
new file mode 100644
index 00000000..7771829d
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-omp/results-single.txt
@@ -0,0 +1 @@
+1 1761.55 15.7786 1734.09 227
diff --git a/projects/2021/groupK_burgers_openmp/branch-upcxx/main.cpp b/projects/2021/groupK_burgers_openmp/branch-upcxx/main.cpp
new file mode 100644
index 00000000..5c7e07ce
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-upcxx/main.cpp
@@ -0,0 +1,725 @@
+#include <Eigen/Dense>
+#include <upcxx/upcxx.hpp>
+
+#include <concepts>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <numbers>
+#include <type_traits>
+#include <limits>
+#include <algorithm>
+#include <utility>
+#include <memory>
+#include <chrono>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+#include <vector>
+#include <sstream>
+
+using Scalar = double;
+using TemplateParamScalar =
+#if __cpp_nontype_template_args >= 201911
+    Scalar
+#else
+    long int
+#endif
+;
+// using Vector3 = Eigen::Matrix<Scalar, 3, 1>;
+using Vector = Eigen::Matrix<Scalar, Eigen::Dynamic, 1>;
+using Vector2 = Eigen::Matrix<Scalar, 2, 1>;
+using Matrix = Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+
+template<typename T>
+concept Tensoroid = true; // CBA
+template<typename T>
+concept Matrixoid = requires(T const& m) {
+    { m(0, 0) } -> std::convertible_to<Scalar>;
+    { m.rows() } -> std::integral;
+    { m.cols() } -> std::integral;
+    // more stuff but CBA
+};
+template<typename T>
+concept VectorField = requires(T const& m) {
+    { m[0](0, 0) } -> std::convertible_to<Scalar>;
+    { m[0] } -> Matrixoid;
+    { m.rows() } -> std::integral;
+    { m.cols() } -> std::integral;
+    // more stuff but CBA
+};
+template<typename T>
+concept Vectoroid = requires(T const& v) {
+    { v[0] } -> std::convertible_to<Scalar>;
+    // more stuff but CBA
+};
+
+namespace Detail {
+    template<typename T>
+    struct IsEigenSequence : std::false_type {};
+    template<typename FirstType, typename SizeType, typename IncrType>
+    struct IsEigenSequence<Eigen::ArithmeticSequence<FirstType, SizeType, IncrType>> : std::true_type {};
+}
+template<typename T>
+concept Sequenceoid = Detail::IsEigenSequence<T>::value;
+
+template<typename T>
+[[nodiscard]] constexpr T sq(T const& x) {
+    return x * x;
+}
+[[nodiscard]] constexpr auto absi(std::signed_integral auto x) noexcept {
+    assert(x != std::numeric_limits<decltype(x)>::min());
+    return x < 0 ? -x : x;
+}
+[[nodiscard]] constexpr auto absi(std::unsigned_integral auto x) noexcept {
+    return x;
+}
+[[nodiscard]] constexpr unsigned exp2i(unsigned x) noexcept {
+    assert(x < std::numeric_limits<unsigned>::digits);
+    return 1<<x;
+}
+inline constexpr Scalar pi = std::numbers::pi_v<Scalar>;
+
+template<std::invocable Invocable>
+struct ScopeGuard {
+    Invocable invocable;
+    [[nodiscard]] explicit constexpr ScopeGuard(Invocable&& invocable) noexcept(std::is_nothrow_move_constructible_v<Invocable>) : invocable(std::move(invocable)) {}
+    [[nodiscard]] explicit constexpr ScopeGuard(Invocable const& invocable) noexcept(std::is_nothrow_copy_constructible_v<Invocable>) : invocable(invocable) {}
+    constexpr ~ScopeGuard() noexcept(std::is_nothrow_invocable_v<Invocable>) { invocable(); }
+};
+
+template<Tensoroid Tensor, unsigned ranks, unsigned slicedRank, std::integral IndexType = int>
+struct Rank1Slice {
+    Tensor* tensor;
+    std::array<IndexType, ranks> position;
+
+    [[nodiscard]] constexpr Rank1Slice(Tensor& tensor, std::convertible_to<IndexType> auto const&... indices)
+    : tensor(std::addressof(tensor)), position{ indices... } {
+        static_assert(sizeof...(indices) == ranks);
+    }
+
+    // not multi-thread friendly
+    [[nodiscard]] constexpr decltype(auto) operator[](IndexType i) const {
+        assert(tensor);
+        const_cast<Rank1Slice*>(this)->position[slicedRank] += i;
+        const ScopeGuard restore([this, i]{ const_cast<Rank1Slice*>(this)->position[slicedRank] -= i; });
+        return std::apply(*tensor, position);
+    }
+};
+template<unsigned slicedRank>
+[[nodiscard]] constexpr auto rank1Slice(Tensoroid auto& tensor, std::integral auto const&... indices) {
+    return Rank1Slice<std::remove_reference_t<decltype(tensor)>, sizeof...(indices), slicedRank, std::common_type_t<std::remove_cvref_t<decltype(indices)>...>>(tensor, indices...);
+}
+
+template<template<typename, typename> typename F, typename I, typename... Args>
+struct FoldR;
+template<template<typename, typename> typename F, typename I, typename T>
+struct FoldR<F, I, T> : std::type_identity<typename F<T, I>::type> {};
+template<template<typename, typename> typename F, typename I, typename T, typename U, typename... Tail>
+struct FoldR<F, I, T, U, Tail...> : std::type_identity<typename F<T, typename FoldR<F, I, U, Tail...>::type>::type> {};
+
+template<std::integral auto index_, TemplateParamScalar factor_>
+struct IndexCoeffPair {
+    static constexpr auto index = index_;
+    static constexpr auto factor = factor_;
+};
+template<typename T>
+struct IsIndexCoeffPair : std::false_type {};
+template<std::integral auto index, TemplateParamScalar factor>
+struct IsIndexCoeffPair<IndexCoeffPair<index, factor>> : std::true_type {};
+template<typename T>
+concept IndexCoeff = IsIndexCoeffPair<T>::value;
+
+template<typename Lhs, typename Rhs>
+struct IndexMin : std::type_identity<std::conditional_t<(Lhs::index < Rhs::index), Lhs, Rhs>> {};
+template<typename Lhs, typename Rhs>
+struct IndexMax : std::type_identity<std::conditional_t<(Lhs::index > Rhs::index), Lhs, Rhs>> {};
+
+enum class StencilType { asymmetric, symmetric, antisymmetric };
+template<StencilType type, IndexCoeff... Pairs>
+struct Stencil /* 1D only */ {
+    static constexpr auto min_index = FoldR<IndexMin, IndexCoeffPair<std::numeric_limits<int>::max(), {}>, Pairs...>::type::index;
+    static constexpr auto max_index = FoldR<IndexMax, IndexCoeffPair<std::numeric_limits<int>::min(), {}>, Pairs...>::type::index;
+
+    static constexpr auto left   = type == StencilType::asymmetric ? min_index : -std::max(absi(min_index), max_index);
+    static constexpr auto right  = type == StencilType::asymmetric ? max_index :  std::max(absi(min_index), max_index);
+    static constexpr auto margin = sizeof...(Pairs) > 0 ? std::max(absi(left), right) : 0;
+
+    [[nodiscard]] static constexpr auto apply(Vectoroid auto&& vector) {
+        switch(type) {
+        case StencilType::asymmetric:
+            return ((Pairs::factor * vector[Pairs::index]) + ...);
+        case StencilType::symmetric:
+            return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] + vector[-Pairs::index] : vector[Pairs::index])) + ...);
+        case StencilType::antisymmetric:
+            return ((Pairs::factor * (Pairs::index ? vector[Pairs::index] - vector[-Pairs::index] : vector[Pairs::index])) + ...);
+        };
+    }
+};
+
+using AdvectionStencil1 = Stencil<StencilType::antisymmetric,
+    IndexCoeffPair<1, TemplateParamScalar( 45)>,
+    IndexCoeffPair<2, TemplateParamScalar(- 9)>,
+    IndexCoeffPair<3, TemplateParamScalar(  1)>>;
+using AdvectionStencil2 = Stencil<StencilType::symmetric,
+    IndexCoeffPair<0, TemplateParamScalar(-20)>,
+    IndexCoeffPair<1, TemplateParamScalar( 15)>,
+    IndexCoeffPair<2, TemplateParamScalar(- 6)>,
+    IndexCoeffPair<3, TemplateParamScalar(  1)>>;
+
+using DiffusionStencil = Stencil<StencilType::symmetric,
+    IndexCoeffPair<0, TemplateParamScalar(-30)>,
+    IndexCoeffPair<1, TemplateParamScalar( 16)>,
+    IndexCoeffPair<2, TemplateParamScalar(- 1)>>;
+
+constexpr int margin = std::max({ AdvectionStencil1::margin, AdvectionStencil2::margin, DiffusionStencil::margin });
+
+enum UseCase { zhao, hopf_cole };
+
+class VectorField2 {
+    Matrix uv[2];
+
+public:
+    [[nodiscard]] VectorField2() = default;
+    [[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols)
+    : uv{ Matrix(rows, cols), Matrix(rows, cols) } {
+    }
+    [[nodiscard]] explicit VectorField2(Eigen::Index rows, Eigen::Index cols, Scalar fill)
+    : uv{ Matrix::Constant(rows, cols, fill), Matrix::Constant(rows, cols, fill) } {
+    }
+
+    [[nodiscard]] constexpr Matrix const& operator[](int i) const noexcept { assert(0 <= i && i < 2); return uv[i]; }
+    [[nodiscard]] constexpr Matrix      & operator[](int i)       noexcept { assert(0 <= i && i < 2); return uv[i]; }
+
+    [[nodiscard]] constexpr Eigen::Index rows() const noexcept { return uv[0].rows(); }
+    [[nodiscard]] constexpr Eigen::Index cols() const noexcept { return uv[0].cols(); }
+};
+
+[[nodiscard]] constexpr VectorField2 solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y, Sequenceoid auto slice_x, Sequenceoid auto slice_y) {
+    const int mi = slice_x.size() ? slice_x[slice_x.size() - 1] - slice_x[0] + 1 : 0;
+    const int mj = slice_y.size() ? slice_y[slice_y.size() - 1] - slice_y[0] + 1 : 0;
+
+    const Matrixoid auto x2d = x(slice_x).transpose().replicate(mj, 1);
+    const Matrixoid auto y2d = y(slice_y).replicate(1, mi);
+
+    const int nx = x2d.cols(), ny = x2d.rows();
+    assert(nx == slice_x.size());
+    assert(y2d.cols() == nx && y2d.rows() == ny);
+
+    using std::exp;
+    VectorField2 result;
+    switch(useCase) {
+    case zhao:
+        result[0] = -4 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().cos() * (pi * y2d).array().sin() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin());
+        result[1] = -2 * mu * pi * exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().cos() / (2 + exp(-5 * sq(pi) * mu * t) * (2 * pi * x2d).array().sin() * (pi * y2d).array().sin());
+        break;
+    case hopf_cole:
+        result[0] = 0.75 - 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu)));
+        result[1] = 0.75 + 1 / (4 * (1 + (-Matrix::Constant(x2d.rows(), x2d.cols(), t) - 4 * x2d + 4 * y2d).array().exp() / (32 * mu)));
+        break;
+    default:
+        assert(false);
+    }
+    return result;
+}
+[[nodiscard]] constexpr decltype(auto) solution_factory(UseCase useCase, Scalar mu, Scalar t, Matrixoid auto const& x, Matrixoid auto const& y) {
+    return solution_factory(useCase, mu, t, x, y, Eigen::seq(0, x.size() - 1), Eigen::seq(0, y.size() - 1));
+}
+
+[[nodiscard]] constexpr VectorField2 initial_solution(UseCase useCase, Scalar mu, Matrixoid auto const& x, Matrixoid auto const& y) {
+    return solution_factory(useCase, mu, 0, x, y);
+}
+
+Vector padVector(Vectoroid auto input, int targetLength, Scalar paddingValue){
+    assert(targetLength >= input.rows());
+    Vector padded(targetLength);
+    padded << input, paddingValue * Vector::Ones(targetLength - input.rows());
+    return padded;
+}
+
+struct DistributedConfig {
+
+    explicit DistributedConfig(int32_t globalWidth, int32_t globalHeight, int32_t ranksWidth, int32_t ranksHeight, int32_t rankIndex, int32_t margin) :
+        RANKS_TOTAL(ranksWidth * ranksHeight),
+        RANKS_WIDTH(ranksWidth),
+        RANKS_HEIGHT(ranksHeight),
+        RANK_INDEX(rankIndex),
+        RANK_X(rankIndex % ranksWidth),
+        RANK_Y(rankIndex / ranksWidth),
+        GLOBAL_WIDTH(globalWidth),
+        GLOBAL_HEIGHT(globalHeight),
+        LOCAL_WIDTH((globalWidth + ranksWidth - 1) / ranksWidth),
+        LOCAL_HEIGHT((globalHeight + ranksHeight - 1) / ranksHeight),
+        STORED_WIDTH(LOCAL_WIDTH + 2 * margin),
+        STORED_HEIGHT(LOCAL_HEIGHT + 2 * margin),
+        MARGIN(margin),
+        DX(1.0 / (globalWidth - 1)),
+        DY(1.0 / (globalHeight - 1)),
+        GLOBAL_X(Vector::LinSpaced(globalWidth, 0, 1)),
+        GLOBAL_Y(Vector::LinSpaced(globalHeight, 0, 1)),
+        LOCAL_X(padVector(GLOBAL_X, LOCAL_WIDTH * RANKS_WIDTH, 1.0).segment(RANK_X * LOCAL_WIDTH, LOCAL_WIDTH)),
+        LOCAL_Y(padVector(GLOBAL_Y, LOCAL_HEIGHT * RANKS_HEIGHT, 1.0).segment(RANK_Y * LOCAL_HEIGHT, LOCAL_HEIGHT))
+        {
+            USED_WIDTH = LOCAL_WIDTH;
+            if (RANK_X == RANKS_WIDTH - 1) {
+                USED_WIDTH = GLOBAL_WIDTH - RANK_X * LOCAL_WIDTH;
+            }
+            USED_HEIGHT = LOCAL_HEIGHT;
+            if (RANK_Y == RANKS_HEIGHT - 1) {
+                USED_HEIGHT = GLOBAL_HEIGHT - RANK_Y * LOCAL_HEIGHT;
+            }
+            if (USED_WIDTH < MARGIN || USED_HEIGHT < MARGIN) {
+                std::cerr << "Invalid segmentation " << USED_WIDTH << "x" << USED_HEIGHT << "\n";
+                std::abort();
+            }
+        }
+
+    // Number of rank matrix blocks
+    int32_t RANKS_TOTAL;
+
+    // Number of rank matrix blocks per global matrix width
+    int32_t RANKS_WIDTH;
+
+    // Number of rank matrix blocks per global matrix height
+    int32_t RANKS_HEIGHT;
+
+    // Index of local rank
+    int32_t RANK_INDEX;
+
+    // X coordinate of local rank's matrix block
+    int32_t RANK_X;
+
+    // Y coordinate of local rank's matrix block
+    int32_t RANK_Y;
+
+    // Width of entire matrix that is being distributed
+    int32_t GLOBAL_WIDTH;
+
+    // Height of entire matrix that is being distributed
+    int32_t GLOBAL_HEIGHT;
+
+    // Width of matrix block that is local to this rank
+    int32_t LOCAL_WIDTH;
+
+    // Height of matrix block that is local to this rank
+    int32_t LOCAL_HEIGHT;
+
+    // Width of matrix block that is used on this rank, i.e. without the extra cells on the right border
+    int32_t USED_WIDTH;
+
+    // Height of matrix block that is used on this rank, i.e. without the extra cells on the bottom border
+    int32_t USED_HEIGHT;
+
+    // Width of matrix that is local to this rank including margins
+    int32_t STORED_WIDTH;
+
+    // Height of matrix that is local to this rank including margins
+    int32_t STORED_HEIGHT;
+
+    // Margin around each local matrix for communication
+    int32_t MARGIN;
+
+    // X coordinate spacing
+    Scalar DX;
+
+    // Y coordinate spacing
+    Scalar DY;
+
+    // X coordinates of entire global matrix
+    Vector GLOBAL_X;
+
+    // Y coordinates of entire global matrix
+    Vector GLOBAL_Y;
+
+    // X coordinates of local matrix block
+    Vector LOCAL_X;
+
+    // Y coordinates of local matrix block
+    Vector LOCAL_Y;
+
+};
+
+class DistributedVectorField2 {
+    DistributedConfig cfg;
+    std::reference_wrapper<std::vector<std::array<upcxx::global_ptr<Scalar>, 2>>> allMatrices;
+    std::vector<Eigen::Map<Matrix, Eigen::Aligned128>> uv;
+
+public:
+    [[nodiscard]] explicit DistributedVectorField2(const VectorField2& o, std::vector<std::array<upcxx::global_ptr<Scalar>, 2>>& allMatrices, DistributedConfig config) :
+        cfg(std::move(config)),
+        allMatrices(allMatrices) {
+
+        for (Eigen::Index dim = 0; dim < 2; dim++) {
+            assert(cfg.GLOBAL_WIDTH == o[dim].cols() && cfg.GLOBAL_HEIGHT == o[dim].rows());
+            allMatrices[cfg.RANK_INDEX][dim] = upcxx::allocate<Scalar>(cfg.STORED_WIDTH * cfg.STORED_HEIGHT, 256);
+            uv.emplace_back(Eigen::Map<Matrix, Eigen::Aligned128>(allMatrices[cfg.RANK_INDEX][dim].local(), cfg.STORED_HEIGHT, cfg.STORED_WIDTH));
+            for (Eigen::Index y = cfg.LOCAL_HEIGHT * cfg.RANK_Y; y < cfg.GLOBAL_HEIGHT && y < cfg.LOCAL_HEIGHT * (cfg.RANK_Y + 1); ++y) {
+                std::copy(o[dim].data() + y * cfg.GLOBAL_WIDTH + cfg.RANK_X * cfg.LOCAL_WIDTH,
+                          o[dim].data() + y * cfg.GLOBAL_WIDTH + std::min(cfg.GLOBAL_WIDTH, (cfg.RANK_X + 1) * cfg.LOCAL_WIDTH),
+                          uv[dim].data() + cfg.MARGIN + (y - cfg.RANK_Y * cfg.LOCAL_HEIGHT + cfg.MARGIN) * cfg.STORED_WIDTH);
+            }
+        }
+    }
+
+    /**
+     * @return if x, y are in bounds
+     */
+    [[nodiscard]] bool inBounds(Eigen::Index x, Eigen::Index y) {
+        if (x < 0) return false;
+        if (x >= cfg.RANKS_WIDTH) return false;
+        if (y < 0) return false;
+        if (y >= cfg.RANKS_HEIGHT) return false;
+        return true;
+    }
+
+    /**
+     * Synchronize borders with a neighbour given an offset
+     * @return future for synchronization
+     */
+    [[nodiscard]] upcxx::future<> synchronizeRight(UseCase useCase, Scalar mu, Scalar t) {
+        Eigen::Index otherX = cfg.RANK_X + 1;
+        Eigen::Index otherY = cfg.RANK_Y;
+        upcxx::future<> fut = upcxx::make_future();
+
+        if (inBounds(otherX, otherY)) {
+            for (Eigen::Index dim = 0; dim < 2; dim++) {
+                upcxx::global_ptr<Scalar> otherStart = allMatrices.get()[otherY * cfg.RANKS_WIDTH + otherX][dim] + cfg.STORED_WIDTH * cfg.MARGIN + cfg.MARGIN;
+                auto localStart = uv[dim].data() + cfg.USED_WIDTH + cfg.MARGIN + cfg.STORED_WIDTH * cfg.MARGIN;
+                fut = upcxx::when_all(fut, upcxx::rget_strided<2>(otherStart,
+                                                                  {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}},
+                                                                  localStart,
+                                                                  {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, {{cfg.MARGIN, cfg.USED_HEIGHT}}));
+            }
+        } else {
+
+            auto slice_x = Eigen::seq(cfg.USED_WIDTH - margin, cfg.USED_WIDTH - 1);
+            auto slice_y = Eigen::seq(0, cfg.USED_HEIGHT - 1);
+            VectorField2 uv_ex = solution_factory(useCase, mu, t, cfg.LOCAL_X, cfg.LOCAL_Y, slice_x, slice_y);
+            uv[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[0]);
+            uv[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[1]);
+        }
+        return fut;
+    }
+
+    [[nodiscard]] upcxx::future<> synchronizeLeft(UseCase useCase, Scalar mu, Scalar t) {
+        Eigen::Index otherX = cfg.RANK_X - 1;
+        Eigen::Index otherY = cfg.RANK_Y;
+        upcxx::future<> fut = upcxx::make_future();
+
+        if (inBounds(otherX, otherY)) {
+
+            for (Eigen::Index dim = 0; dim < 2; dim++) {
+                upcxx::global_ptr<Scalar> otherStart = allMatrices.get()[otherY * cfg.RANKS_WIDTH + otherX][dim] + cfg.LOCAL_WIDTH + cfg.STORED_WIDTH * cfg.MARGIN;
+                auto localStart = uv[dim].data() + cfg.STORED_WIDTH * cfg.MARGIN;
+                fut = upcxx::when_all(fut, upcxx::rget_strided<2>(otherStart,
+                                                                  {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}},
+                                                                  localStart,
+                                                                  {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, {{cfg.MARGIN, cfg.USED_HEIGHT}}));
+            }
+        } else {
+
+            auto slice_x = Eigen::seq(0, cfg.MARGIN - 1);
+            auto slice_y = Eigen::seq(0, cfg.USED_HEIGHT - 1);
+            VectorField2 uv_ex = solution_factory(useCase, mu, t, cfg.LOCAL_X, cfg.LOCAL_Y, slice_x, slice_y);
+            uv[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[0]);
+            uv[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[1]);
+        }
+        return fut;
+    }
+
+    [[nodiscard]] upcxx::future<> synchronizeUp(UseCase useCase, Scalar mu, Scalar t) {
+        Eigen::Index otherX = cfg.RANK_X;
+        Eigen::Index otherY = cfg.RANK_Y - 1;
+        upcxx::future<> fut = upcxx::make_future();
+        if (inBounds(otherX, otherY)) {
+
+            for (Eigen::Index dim = 0; dim < 2; dim++) {
+                upcxx::global_ptr<Scalar> otherStart = allMatrices.get()[otherY * cfg.RANKS_WIDTH + otherX][dim] + cfg.MARGIN + cfg.STORED_WIDTH * cfg.LOCAL_HEIGHT;
+                auto localStart = uv[dim].data() + cfg.MARGIN;
+                fut = upcxx::when_all(fut, upcxx::rget_strided<2>(otherStart,
+                                                                  {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}},
+                                                                  localStart,
+                                                                  {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, {{cfg.USED_WIDTH, cfg.MARGIN}}));
+            }
+        } else {
+
+            auto slice_x = Eigen::seq(0, cfg.USED_WIDTH - 1);
+            auto slice_y = Eigen::seq(0, cfg.MARGIN - 1);
+            VectorField2 uv_ex = solution_factory(useCase, mu, t, cfg.LOCAL_X, cfg.LOCAL_Y, slice_x, slice_y);
+            uv[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[0]);
+            uv[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[1]);
+        }
+        return fut;
+    }
+
+    [[nodiscard]] upcxx::future<> synchronizeDown(UseCase useCase, Scalar mu, Scalar t) {
+        Eigen::Index otherX = cfg.RANK_X;
+        Eigen::Index otherY = cfg.RANK_Y + 1;
+        upcxx::future<> fut = upcxx::make_future();
+        if (inBounds(otherX, otherY)) {
+
+            for (Eigen::Index dim = 0; dim < 2; dim++) {
+                upcxx::global_ptr<Scalar> otherStart = allMatrices.get()[otherY * cfg.RANKS_WIDTH + otherX][dim] + cfg.MARGIN + cfg.MARGIN * cfg.STORED_WIDTH;
+                auto localStart = uv[dim].data() + cfg.MARGIN + cfg.STORED_WIDTH * (cfg.MARGIN + cfg.USED_HEIGHT);
+                fut = upcxx::when_all(fut, upcxx::rget_strided<2>(otherStart,
+                                                                  {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}},
+                                                                  localStart,
+                                                                  {{sizeof(Scalar), sizeof(Scalar) * cfg.STORED_WIDTH}}, {{cfg.USED_WIDTH, cfg.MARGIN}}));
+            }
+        } else {
+
+            auto slice_x = Eigen::seq(0, cfg.USED_WIDTH - 1);
+            auto slice_y = Eigen::seq(cfg.USED_HEIGHT - cfg.MARGIN, cfg.USED_HEIGHT - 1);
+            VectorField2 uv_ex = solution_factory(useCase, mu, t, cfg.LOCAL_X, cfg.LOCAL_Y, slice_x, slice_y);
+            uv[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[0]);
+            uv[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH)(slice_y, slice_x) = std::move(uv_ex[1]);
+        }
+        return fut;
+    }
+
+    [[nodiscard]] Eigen::Map<Matrix, Eigen::Aligned128> const& operator[](int i) const noexcept { assert(0 <= i && i < 2); return uv[i]; }
+    [[nodiscard]] Eigen::Map<Matrix, Eigen::Aligned128>      & operator[](int i)       noexcept { assert(0 <= i && i < 2); return uv[i]; }
+
+    [[nodiscard]] Eigen::Index rows() const noexcept { return uv[0].rows(); }
+    [[nodiscard]] Eigen::Index cols() const noexcept { return uv[0].cols(); }
+};
+
+[[nodiscard]] constexpr Scalar advection(Scalar d, Scalar w, Vectoroid auto const& phiSlice) {
+    return
+        (           w  * AdvectionStencil1::apply(phiSlice)
+         - std::abs(w) * AdvectionStencil2::apply(phiSlice)) / (60 * d);
+}
+template<std::integral auto advectRank>
+void advection(Vectoroid auto const& d, Matrixoid auto const& w, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable<Scalar&, Scalar> auto assignment) {
+    const int nx = w.cols(), ny = w.rows();
+    assert(phi.cols() == nx && phi.rows() == ny);
+    assert(out.cols() == nx && out.rows() == ny);
+    for(int x = margin; x < nx - margin; ++x)
+        for(int y = margin; y < ny - margin; ++y)
+            assignment(out(y, x), advection(d[advectRank], w(y, x), rank1Slice<advectRank>(phi, y, x)));
+}
+void advection(Vectoroid auto const& d, VectorField auto const& uv, VectorField auto& adv) {
+    advection<1>(d, uv[0], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+    advection<0>(d, uv[1], uv[0], adv[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+
+    advection<1>(d, uv[0], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+    advection<0>(d, uv[1], uv[1], adv[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+}
+
+[[nodiscard]] constexpr Scalar diffusion(Scalar d, Vectoroid auto const& phiSlice) {
+    return 1 / (12 * sq(d)) * DiffusionStencil::apply(phiSlice);
+}
+template<std::integral auto diffuseRank>
+void diffusion(Vectoroid auto const& d, Matrixoid auto const& phi, Matrixoid auto& out, std::invocable<Scalar&, Scalar> auto assignment) {
+    const int nx = phi.cols(), ny = phi.rows();
+    assert(out.cols() == nx && out.rows() == ny);
+    for(int x = margin; x < nx - margin; ++x)
+        for(int y = margin; y < ny - margin; ++y)
+            assignment(out(y, x), diffusion(d[diffuseRank], rank1Slice<diffuseRank>(phi, y, x)));
+}
+void diffusion(Vectoroid auto const& d, VectorField auto const& uv, VectorField auto& diff) {
+    diffusion<1>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+    diffusion<0>(d, uv[0], diff[0], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+
+    diffusion<1>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs  = rhs; });
+    diffusion<0>(d, uv[1], diff[1], [](Scalar& lhs, Scalar rhs) constexpr noexcept { lhs += rhs; });
+}
+
+void rk_stage(
+    Scalar mu,
+    VectorField auto const& in_now,
+    VectorField auto const& in_tmp,
+    VectorField auto& out,
+    VectorField auto& adv,
+    VectorField auto& diff,
+    Scalar dt,
+    Vectoroid auto const& d
+) {
+    advection(d, in_tmp, adv);
+    diffusion(d, in_tmp, diff);
+    out[0] = in_now[0] + dt * (-adv[0] + mu * diff[0]);
+    out[1] = in_now[1] + dt * (-adv[1] + mu * diff[1]);
+}
+
+
+// TODO: only a makeshift because my stdlib doesn't implement this
+template<typename CharT, typename Traits, typename Rep, typename Period>
+std::basic_ostream<CharT, Traits>& operator<<(std::basic_ostream<CharT, Traits>& os, std::chrono::duration<Rep, Period> const& d) {
+    std::basic_ostringstream<CharT, Traits> s;
+    s.flags(os.flags());
+    s.imbue(os.getloc());
+    s.precision(os.precision());
+    s << d.count();
+         if constexpr(std::is_same_v<typename Period::type, std::nano    >) s << "ns";
+    else if constexpr(std::is_same_v<typename Period::type, std::micro   >) s << "µs";
+    else if constexpr(std::is_same_v<typename Period::type, std::milli   >) s << "ms";
+    else if constexpr(std::is_same_v<typename Period::type, std::ratio<1>>) s << "s";
+    else {
+        s << '[' << Period::type::num;
+        if constexpr(Period::type::den != 1)
+            s << '/' << Period::type::den;
+        s << ']';
+    }
+    return os << s.str();
+}
+
+template<typename T>
+struct WelfordVariance {
+    std::size_t n = 0;
+    T mean{};
+    T m{};
+
+    constexpr void update(T const& x) {
+        ++n;
+        const T old_mean = mean;
+        mean = ((n - 1) * old_mean + x) / n;
+        m += (x - old_mean) * (x - mean);
+    }
+
+    [[nodiscard]] constexpr T variance() { return m / n; }
+    [[nodiscard]] constexpr T sample_variance() { return m / (n - 1); }
+};
+
+struct BenchmarkReport {
+    using Duration = std::chrono::duration<double, std::milli>;
+
+    Duration best;
+    Duration average;
+    Duration variance;
+    unsigned runs;
+};
+std::ostream& operator<<(std::ostream& stream, BenchmarkReport const& report) {
+    stream << "runtime: " << report.average << " ± " << std::sqrt(report.variance.count()) << "ms" << "; best: " << report.best << "; n=" << report.runs << '\n';
+    return stream;
+}
+
+template<std::invocable Invocable, typename Clock>
+[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak, Clock clock) {
+    assert(streak > 0);
+    BenchmarkReport result;
+    WelfordVariance<BenchmarkReport::Duration::rep> variance;
+    result.best = BenchmarkReport::Duration(std::numeric_limits<double>::max());
+    for(unsigned current_streak = 0; current_streak < streak; ) {
+        upcxx::barrier();
+        const auto start = clock.now();
+        invocable();
+        const auto end = clock.now();
+        auto duration = std::chrono::duration_cast<BenchmarkReport::Duration>(end - start);
+        upcxx::barrier();
+        duration = upcxx::broadcast(duration, 0).wait();
+        if(duration < result.best) {
+            result.best = duration;
+            current_streak = 0;
+        } else {
+            ++current_streak;
+        }
+        variance.update(duration.count());
+    }
+    result.average = BenchmarkReport::Duration(variance.mean);
+    result.variance = BenchmarkReport::Duration(variance.sample_variance());
+    result.runs = variance.n;
+    return result;
+}
+template<std::invocable Invocable>
+[[nodiscard]] BenchmarkReport benchmark(Invocable&& invocable, unsigned streak = 20) {
+    if constexpr(std::chrono::high_resolution_clock::is_steady)
+        return benchmark(std::forward<Invocable>(invocable), streak, std::chrono::high_resolution_clock{});
+    else
+        return benchmark(std::forward<Invocable>(invocable), streak, std::chrono::steady_clock{});
+}
+
+int main(int argc, char **argv) {
+
+    if (argc != 5) {
+        std::cerr << "Usage: " << argv[0] << " <grid factor> <ranks width> <ranks height> <num iterations multiplier>" << std::endl;
+        return 2;
+    }
+    // use case
+    constexpr UseCase useCase = zhao;
+
+    // diffusion coefficient
+    const Scalar mu = 0.1;
+
+    // grid
+    const int factor = std::stol(argv[1]);
+    const uint32_t coreWidth = std::stol(argv[2]);
+    const uint32_t coreHeight = std::stol(argv[3]);
+    upcxx::init();
+    const uint32_t coreIndex = upcxx::rank_me();
+    const DistributedConfig cfg(factor, factor, coreWidth, coreHeight, coreIndex, margin);
+
+    if (coreWidth * coreHeight != upcxx::rank_n()) {
+        std::cerr << "Using invalid number of ranks\n";
+        return 1;
+    }
+
+    // time
+    const Scalar cfl = 1;
+    const Scalar timestep = cfl / sq(cfg.GLOBAL_WIDTH-1);
+    const Scalar n_iter = sq(cfg.GLOBAL_WIDTH-1) * std::stol(argv[4]) / 1000;
+    // output
+    const int print_period = 0;
+    // vector fields
+
+    const auto syncMatrices = [](std::vector<std::array<upcxx::global_ptr<Scalar>, 2>>& mats) {
+        for (uint32_t i = 0; i < mats.size(); i++) {
+            mats[i] = upcxx::broadcast(mats[i], i).wait();
+        }
+    };
+
+    const auto single_threaded_task = [&]{
+
+        std::vector<std::array<upcxx::global_ptr<Scalar>, 2>> allMatricesNow(cfg.RANKS_TOTAL);
+        DistributedVectorField2 uv_now(VectorField2(cfg.GLOBAL_HEIGHT, cfg.GLOBAL_WIDTH, 0), allMatricesNow, cfg);
+        syncMatrices(allMatricesNow);
+
+        std::vector<std::array<upcxx::global_ptr<Scalar>, 2>> allMatricesNew(cfg.RANKS_TOTAL);
+        DistributedVectorField2 uv_new(initial_solution(useCase, mu, cfg.GLOBAL_X, cfg.GLOBAL_Y), allMatricesNew, cfg);
+        syncMatrices(allMatricesNew);
+
+        upcxx::barrier();
+        upcxx::when_all(uv_new.synchronizeRight(useCase, mu, 0), uv_new.synchronizeLeft(useCase, mu, 0), uv_new.synchronizeUp(useCase, mu, 0), uv_new.synchronizeDown(useCase, mu, 0)).wait();
+        upcxx::barrier();
+
+        VectorField2 adv(uv_now[0].rows(), uv_now[0].cols(), 0);
+        VectorField2 diff(uv_now[0].rows(), uv_now[0].cols(), 0);
+
+        const auto rk_fractions = { 1.0 / 3, 1.0 / 2, 1.0 };
+        Scalar t = 0;
+
+        for(int i = 0; i < n_iter; ++i) {
+            uv_now = uv_new;
+
+            for(const double rk_fraction : rk_fractions) {
+                const Scalar dt = rk_fraction * timestep;
+                rk_stage(mu, uv_now, uv_new, uv_new, adv, diff, dt, Vector2{ cfg.DX, cfg.DY });
+                upcxx::barrier();
+                upcxx::when_all(uv_new.synchronizeRight(useCase, mu, t + dt), uv_new.synchronizeLeft(useCase, mu, t + dt), uv_new.synchronizeUp(useCase, mu, t + dt), uv_new.synchronizeDown(useCase, mu, t + dt)).wait();
+                upcxx::barrier();
+            }
+
+            t += timestep;
+            if(print_period > 0 && ((i+1) % print_period == 0 || i+1 == n_iter)) {
+                VectorField2 uv_ex  = solution_factory(useCase, mu, t, cfg.LOCAL_X.segment(0, cfg.USED_WIDTH), cfg.LOCAL_Y.segment(0, cfg.USED_HEIGHT));
+                const Scalar local_err_u = (uv_new[0].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH) - uv_ex[0]).squaredNorm() * cfg.DX * cfg.DY;
+                const Scalar local_err_v = (uv_new[1].block(cfg.MARGIN, cfg.MARGIN, cfg.USED_HEIGHT, cfg.USED_WIDTH) - uv_ex[1]).squaredNorm() * cfg.DX * cfg.DY;
+                const Scalar err_u = std::sqrt(upcxx::reduce_all(local_err_u, std::plus<Scalar>()).wait());
+                const Scalar err_v = std::sqrt(upcxx::reduce_all(local_err_v, std::plus<Scalar>()).wait());
+                upcxx::barrier();
+                if (cfg.RANK_INDEX == 0) {
+                    std::cout << "Iteration " << std::right << std::setfill(' ') << std::setw(6) << i + 1;
+                    std::cout << std::scientific << std::setprecision(4);
+                    std::cout << ": ||u - uex|| = " << err_u << " m/s, ||v - vex|| = " << err_v << " m/s\n";
+                    std::cout << std::defaultfloat;
+                }
+            }
+        }
+    };
+    if (upcxx::rank_me() == 0) {
+        std::cout << "Time: " << benchmark(single_threaded_task);
+    } else {
+        benchmark(single_threaded_task);
+    }
+
+    upcxx::finalize();
+}
diff --git a/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_small_grid b/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_small_grid
new file mode 100644
index 00000000..bd7c891b
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_small_grid
@@ -0,0 +1,10 @@
+Time: runtime: 5799.27ms ± 75.6701ms; best: 5678.92ms; n=33
+Time: runtime: 2074.05ms ± 46.3968ms; best: 1990.05ms; n=47
+Time: runtime: 1692.26ms ± 48.7114ms; best: 1609.17ms; n=43
+Time: runtime: 1392.18ms ± 57.8365ms; best: 1301.65ms; n=33
+Time: runtime: 1303.88ms ± 95.567ms; best: 1180.74ms; n=36
+Time: runtime: 5788.49ms ± 71.2364ms; best: 5660.38ms; n=45
+Time: runtime: 3537.47ms ± 59.8368ms; best: 3432.72ms; n=49
+Time: runtime: 2434.85ms ± 38.4084ms; best: 2382.31ms; n=30
+Time: runtime: 1802.55ms ± 97.6827ms; best: 1615.69ms; n=23
+Time: runtime: 1787.4ms ± 43.9987ms; best: 1720.63ms; n=48
diff --git a/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_strong_scaling b/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_strong_scaling
new file mode 100644
index 00000000..f53c8111
--- /dev/null
+++ b/projects/2021/groupK_burgers_openmp/branch-upcxx/upcxx_strong_scaling
@@ -0,0 +1,10 @@
+Time: runtime: 4279.9ms ± 134.979ms; best: 4030.75ms; n=59
+Time: runtime: 1100.13ms ± 28.4733ms; best: 1066.17ms; n=35
+Time: runtime: 800.246ms ± 44.5372ms; best: 753.12ms; n=28
+Time: runtime: 546.107ms ± 17.7146ms; best: 521.13ms; n=27
+Time: runtime: 497.132ms ± 43.4852ms; best: 443.236ms; n=37
+Time: runtime: 4401.04ms ± 132.005ms; best: 4201.34ms; n=29
+Time: runtime: 2190.8ms ± 45.1649ms; best: 2107.44ms; n=38
+Time: runtime: 1183.65ms ± 24.1417ms; best: 1148.65ms; n=45
+Time: runtime: 725.589ms ± 21.6998ms; best: 688.194ms; n=60
+Time: runtime: 594.244ms ± 45.5923ms; best: 564.256ms; n=32