From efb27bf568366fd220412ebabe1818eabc6ec0ff Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Mon, 18 Mar 2024 14:48:20 +0100 Subject: [PATCH 001/108] Adding new macro for declaring enum classes with useful utils --- include/declare_enum.h | 60 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 include/declare_enum.h diff --git a/include/declare_enum.h b/include/declare_enum.h new file mode 100644 index 0000000000..175c47722e --- /dev/null +++ b/include/declare_enum.h @@ -0,0 +1,60 @@ +/* + * A macro that declares an `enum class` as well as a `to_string` function for the enums. + * The enum has also a default value `size` that measures the size of the enum. + * + * Credit: https://stackoverflow.com/a/71375077/12084612 + * ------- + * License: CC BY-SA 4.0 + * -------- + * Usage: + * ------ + * + * DECLARE_ENUM(WeekEnum, Mon, Tue, Wed, Thu, Fri, Sat, Sun,); + * + * int main() + * { + * WeekEnum weekDay = WeekEnum::Wed; + * std::cout << to_string(weekDay) << std::endl; // prints Wed + * std::cout << to_string(WeekEnum::Sat) << std::endl; // prints Sat + * std::cout << to_string((int) WeekEnum::size) << std::endl; // prints 7 + * return 0; + * } + * + */ + +#pragma once +#include +#include +#include +#include +#include + +// Add the definition of this method into a cpp file. (only the declaration in the header) +static inline const std::vector &get_enum_names(const std::string &en_key, const std::string &en_str) +{ + static std::unordered_map> en_names_map; + const auto it = en_names_map.find(en_key); + if (it != en_names_map.end()) return it->second; + + constexpr auto delim(','); + std::vector en_names; + std::size_t start {}; + auto end = en_str.find(delim); + while (end != std::string::npos) { + while (en_str[start] == ' ') ++start; + en_names.push_back(en_str.substr(start, end - start)); + start = end + 1; + end = en_str.find(delim, start); + } + while (en_str[start] == ' ') ++start; + en_names.push_back(en_str.substr(start)); + return en_names_map.emplace(en_key, std::move(en_names)).first->second; +} + +#define DECLARE_ENUM(ENUM_NAME, ...) \ + enum class ENUM_NAME : unsigned int { __VA_ARGS__ size }; \ + inline std::string to_string(ENUM_NAME en) \ + { \ + const auto &names = get_enum_names(#ENUM_NAME #__VA_ARGS__, #__VA_ARGS__); \ + return names[static_cast(en)]; \ + } From 2514ca715c3d8f977317ee1c33d57fcdfef409d3 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Mon, 18 Mar 2024 14:50:54 +0100 Subject: [PATCH 002/108] Adding DDParam structure used as property for any lattice_field --- include/domain_decomposition.h | 153 +++++++++++++++++++++++++++++++++ include/lattice_field.h | 35 ++++++++ 2 files changed, 188 insertions(+) create mode 100644 include/domain_decomposition.h diff --git a/include/domain_decomposition.h b/include/domain_decomposition.h new file mode 100644 index 0000000000..464cd3b2c0 --- /dev/null +++ b/include/domain_decomposition.h @@ -0,0 +1,153 @@ +#pragma once + +#include +#include "declare_enum.h" + +namespace quda +{ + + // using namespace quda; + + DECLARE_ENUM(DD, // name of the enum class + in_use, // whether DD is in use + + mode_red_black, // red-black DD, e.g. for SAP + red_active, // if red blocks are active + black_active, // if black blocks are active + rb_hopping, // if hopping between red and black is allowed + first_black, // the color of the first block of the local lattice + ); + + // Params for domain decompation + // we template over Int such that int_fastdiv can be used in kernels + template class DDParam + { + + public: + bool flags[(int)DD::size]; // the default value of all flags is 0 + Int blockDim[QUDA_MAX_DIM]; // the size of the block per direction + + // Default constructor + DDParam() : flags {0}, blockDim {0} { } + + // Default copy + DDParam(const DDParam &dd) = default; + + // Default copy + template DDParam(const DDParam &dd) + { +#pragma unroll + for (auto i = 0u; i < (int)DD::size; i++) flags[i] = dd.flags[i]; +#pragma unroll + for (auto i = 0u; i < QUDA_MAX_DIM; i++) blockDim[i] = dd.blockDim[i]; + } + + // sets the given flag to true + constexpr inline DDParam &operator&=(const DD &flag) + { + flags[(int)flag] = true; + return *this; + } + + // returns false if in use + constexpr inline bool operator!() const { return not flags[(int)DD::in_use]; } + + // returns value of given flag + constexpr inline bool operator&&(const DD &flag) const { return flags[(int)flag]; } + + constexpr inline bool is(const DD &flag) const { return flags[(int)flag]; } + + // Pretty print the args struct + void print() + { + if (not *this) { + printfQuda("DDParam not in use\n"); + return; + } + printfQuda("Printing DDParam\n"); + for (int i = 0; i < (int)DD::size; i++) + printfQuda("flags[DD::%s] = %s\n", to_string((DD)i).c_str(), flags[i] ? "true" : "false"); + for (int i = 0; i < QUDA_MAX_DIM; i++) printfQuda("blockDim[%d] = %d\n", i, static_cast(blockDim[i])); + } + + // template // if true, prints debug information + inline bool match(const DDParam &dd) const + { + bool debug = true; + // if one of the two is not in use we return true, i.e. one of the two is a full field + if (not *this or not dd) return true; + + // false if only one is red_black + if (is(DD::mode_red_black) ^ dd.is(DD::mode_red_black)) { + if (debug) printfQuda("Only one of the two is red_black\n"); + return false; + } + + if (is(DD::mode_red_black) and dd.is(DD::mode_red_black)) + for (int i = 0; i < QUDA_MAX_DIM; i++) + if (blockDim[i] != dd.blockDim[i]) { + if (debug) + printfQuda("blockDim[%d] = %d != %d \n", i, static_cast(blockDim[i]), + static_cast(dd.blockDim[i])); + return false; + } + + return true; + } + + /* if a field is zero at given coordinates */ + template constexpr inline bool isZero(const Coord &x, const Arg &arg) const + { + + // if DD not in use we return immidiatelly + if (not is(DD::in_use)) return false; + + if (is(DD::mode_red_black)) { + + // Computing block_parity: 0 = red, 1 = black + int block_parity = is(DD::first_black); + for (int i = 0; i < x.size(); i++) { block_parity += x[i] / blockDim[i]; } + block_parity %= 2; + + // Checking if my parity is active + if (not is(DD::red_active) and block_parity == 0) return true; + if (not is(DD::black_active) and block_parity == 1) return true; + } + + return false; + } + + /* if hopping is allowed */ + template + constexpr inline bool doHopping(const Coord &x, const int &mu, const int &dir, const Arg &arg) const + { + + // if DD not in use we return immidiatelly + if (not is(DD::in_use)) return true; + + if (is(DD::mode_red_black)) { + + // Checking if we are on the border + bool on_border = (dir > 0) ? ((x[mu] + 1) % blockDim[mu] == 0) : (x[mu] % blockDim[mu] == 0); + + // If all the follwing are true then it is a full operator + if (is(DD::red_active) and is(DD::black_active)) { + if (not on_border || is(DD::rb_hopping)) return true; + return false; + } + + // Computing block_parity: 0 = red, 1 = black + int block_parity = is(DD::first_black) + on_border; + for (int i = 0; i < x.size(); i++) { block_parity += x[i] / blockDim[i]; } + block_parity %= 2; + + // Checking if my parity is active + if (is(DD::red_active) and block_parity == 0) return true; + if (is(DD::black_active) and block_parity == 1) return true; + } + + return false; + } + }; + +} // namespace quda diff --git a/include/lattice_field.h b/include/lattice_field.h index 462add3bde..0e044fe330 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -7,6 +7,7 @@ #include #include #include +#include /** * @file lattice_field.h @@ -478,6 +479,9 @@ namespace quda { */ inline static int bufferIndex = 0; + /** Domain decomposition options */ + DDParam dd {}; + /** @brief Default constructor */ @@ -928,6 +932,37 @@ namespace quda { #define checkNative(...) Native_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** + @brief Helper function for determining if the domain decomposition of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @return true if all fields match + */ + template + inline bool DD_(const char *func, const char *file, int line, const T1 &a_, const T2 &b_) + { + const unwrap_t &a(a_); + const unwrap_t &b(b_); + if (!a.dd.match(b.dd)) errorQuda("DD not match (%s:%d in %s())", file, line, func); + return true; + } + + /** + @brief Helper function for determining if the domain decomposition of the fields is the same. + @param[in] a Input field + @param[in] b Input field + @param[in] args List of additional fields to check domain decomposition on + @return true if all fields match + */ + template + inline bool DD_(const char *func, const char *file, int line, const T1 &a, const T2 &b, const Args &...args) + { + // checking all possible pairs + return (DD_(func, file, line, a, b) & DD_(func, file, line, a, args...) & DD_(func, file, line, b, args...)); + } + +#define checkDD(...) DD_(__func__, __FILE__, __LINE__, __VA_ARGS__) + /** @brief Return whether data is reordered on the CPU or GPU. This can set at QUDA initialization using the environment variable From 1c89d4ff9b31c7bfb93cb4ab6475d6c417b6c80a Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Mon, 18 Mar 2024 14:54:13 +0100 Subject: [PATCH 003/108] Adding method size to Coord --- include/index_helper.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/include/index_helper.cuh b/include/index_helper.cuh index 990ff949e7..eae33e1e8d 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -236,6 +236,7 @@ namespace quda { int X; // full lattice site index constexpr const int& operator[](int i) const { return x[i]; } constexpr int& operator[](int i) { return x[i]; } + constexpr int size() const { return nDim; } }; /** From 82e3d6ca3c60e77e44e530e84461e6d9a22af225 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Mon, 18 Mar 2024 14:59:57 +0100 Subject: [PATCH 004/108] First implementation of DDaware dslash operator --- include/dslash_helper.cuh | 9 +++++++++ include/kernels/dslash_wilson.cuh | 30 +++++++++++++++++++++++------- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index e67582b682..bcd7bc6dce 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -11,6 +11,7 @@ #include #include #include +#include #if defined(_NVHPC_CUDA) #include @@ -286,6 +287,10 @@ namespace quda int exterior_dims; // dimension to run in the exterior Dslash int exterior_blocks; + DDParam dd_out; + DDParam dd_in; + DDParam dd_x; + // for shmem ... static constexpr bool packkernel = false; void *packBuffer[4 * QUDA_MAX_DIM]; @@ -337,6 +342,9 @@ namespace quda pack_blocks(0), exterior_dims(0), exterior_blocks(0), + dd_out(out.dd), + dd_in(in.dd), + dd_x(x.dd), #ifndef NVSHMEM_COMMS counter(0) #else @@ -353,6 +361,7 @@ namespace quda checkOrder(out, in, x); // check all orders match checkPrecision(out, in, x, U); // check all precisions match checkLocation(out, in, x, U); // check all locations match + checkDD(out, in, x); // check all DD match if (!in.isNative() || !U.isNative()) errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index f87e8f9865..c5a5b03303 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -73,7 +73,8 @@ namespace quda #pragma unroll for (int d = 0; d < 4; d++) { // loop over dimension - 4 and not nDim since this is used for DWF as well - { // Forward gather - compute fwd offset for vector fetch + // Forward gather - compute fwd offset for vector fetch + if (arg.dd_in.doHopping(coord, d, +1, arg.dc)) { const int fwd_idx = getNeighborIndexCB(coord, d, +1, arg.dc); const int gauge_idx = (Arg::nDim == 5 ? coord.x_cb % arg.dc.volume_4d_cb : coord.x_cb); constexpr int proj_dir = dagger ? +1 : -1; @@ -99,7 +100,8 @@ namespace quda } } - { // Backward gather - compute back offset for spinor and gauge fetch + // Backward gather - compute back offset for spinor and gauge fetch + if (arg.dd_in.doHopping(coord, d, -1, arg.dc)) { const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc); const int gauge_idx = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx); constexpr int proj_dir = dagger ? -1 : +1; @@ -148,15 +150,29 @@ namespace quda const int my_spinor_parity = nParity == 2 ? parity : 0; Vector out; - applyWilson(out, arg, coord, parity, idx, thread_dim, active); int xs = coord.x_cb + coord.s * arg.dc.volume_4d_cb; + if (arg.dd_out.isZero(coord, arg.dc)) { + if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out(xs, my_spinor_parity) = out; + return; + } + + applyWilson(out, arg, coord, parity, idx, thread_dim, active); + if (xpay && mykernel_type == INTERIOR_KERNEL) { - Vector x = arg.x(xs, my_spinor_parity); - out = x + arg.a * out; + if (arg.dd_x.isZero(coord, arg.dc)) { + out = arg.a * out; + } else { + Vector x = arg.x(xs, my_spinor_parity); + out = x + arg.a * out; + } } else if (mykernel_type != INTERIOR_KERNEL && active) { - Vector x = arg.out(xs, my_spinor_parity); - out = x + (xpay ? arg.a * out : out); + if (arg.dd_x.isZero(coord, arg.dc)) { + out = (xpay ? arg.a * out : out); + } else { + Vector x = arg.out(xs, my_spinor_parity); + out = x + (xpay ? arg.a * out : out); + } } if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out(xs, my_spinor_parity) = out; From 2ac39adde2f6bdcc48d1a8abf617972392928bab Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 19 Mar 2024 08:58:10 +0100 Subject: [PATCH 005/108] Added dd options for tests --- tests/utils/command_line_params.cpp | 20 ++++++++++++++++---- tests/utils/command_line_params.h | 4 ++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp index d9cf5227d1..8cfb639c2f 100644 --- a/tests/utils/command_line_params.cpp +++ b/tests/utils/command_line_params.cpp @@ -74,6 +74,9 @@ QudaInverterType precon_type = QUDA_INVALID_INVERTER; QudaSchwarzType precon_schwarz_type = QUDA_INVALID_SCHWARZ; QudaAcceleratorType precon_accelerator_type = QUDA_INVALID_ACCELERATOR; +std::array dd_block_size = {4, 4, 4, 4}; +bool dd_red_black = false; + double madwf_diagonal_suppressor = 0.0; int madwf_ls = 4; int madwf_null_miniter = niter; @@ -1029,7 +1032,7 @@ void add_multigrid_option_group(std::shared_ptr quda_app) void add_eofa_option_group(std::shared_ptr quda_app) { - auto opgroup = quda_app->add_option_group("EOFA", "Options controlling EOFA parameteres"); + auto opgroup = quda_app->add_option_group("EOFA", "Options controlling EOFA parameters"); CLI::TransformPairs eofa_pm_map {{"plus", 1}, {"minus", 0}}; opgroup->add_option("--eofa-pm", eofa_pm, "Set to evalute \"plus\" or \"minus\" EOFA operator (default plus)") @@ -1040,9 +1043,19 @@ void add_eofa_option_group(std::shared_ptr quda_app) opgroup->add_option("--eofa-mq3", eofa_mq1, "Set mq3 for EOFA operator (default 1.0)"); } +void add_dd_option_group(std::shared_ptr quda_app) +{ + auto opgroup = quda_app->add_option_group("DD", "Options controlling DD parameters"); + opgroup + ->add_option("--dd-block-size", dd_block_size, + "Set the domain decomposition block size in all four dimension (default 4 4 4 4)") + ->expected(4); + opgroup->add_option("--dd-red-black", dd_red_black, "Enable red-black domain decomposition (default false)"); +} + void add_madwf_option_group(std::shared_ptr quda_app) { - auto opgroup = quda_app->add_option_group("MADWF", "Options controlling MADWF parameteres"); + auto opgroup = quda_app->add_option_group("MADWF", "Options controlling MADWF parameters"); opgroup->add_option("--madwf-diagonal-suppressor", madwf_diagonal_suppressor, "Set the digonal suppressor for MADWF (default 0)"); @@ -1103,8 +1116,7 @@ void add_gaugefix_option_group(std::shared_ptr quda_app) void add_comms_option_group(std::shared_ptr quda_app) { - auto opgroup - = quda_app->add_option_group("Communication", "Options controlling communication (split grid) parameteres"); + auto opgroup = quda_app->add_option_group("Communication", "Options controlling communication (split grid) parameters"); opgroup->add_option("--grid-partition", grid_partition, "Set the grid partition (default 1 1 1 1)")->expected(4); } diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h index fb52e5764d..4e88d9313e 100644 --- a/tests/utils/command_line_params.h +++ b/tests/utils/command_line_params.h @@ -133,6 +133,7 @@ void add_eigen_option_group(std::shared_ptr quda_app); void add_deflation_option_group(std::shared_ptr quda_app); void add_multigrid_option_group(std::shared_ptr quda_app); void add_eofa_option_group(std::shared_ptr quda_app); +void add_dd_option_group(std::shared_ptr quda_app); void add_madwf_option_group(std::shared_ptr quda_app); void add_su3_option_group(std::shared_ptr quda_app); void add_heatbath_option_group(std::shared_ptr quda_app); @@ -211,6 +212,9 @@ extern QudaInverterType precon_type; extern QudaSchwarzType precon_schwarz_type; extern QudaAcceleratorType precon_accelerator_type; +extern std::array dd_block_size; +extern bool dd_red_black; + extern double madwf_diagonal_suppressor; extern int madwf_ls; extern int madwf_null_miniter; From 3945185153b8de44df95e9d6970ec25fd5ec36ca Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 19 Mar 2024 08:58:30 +0100 Subject: [PATCH 006/108] Added set and reset functions --- include/domain_decomposition.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/include/domain_decomposition.h b/include/domain_decomposition.h index 464cd3b2c0..c9b2ca039b 100644 --- a/include/domain_decomposition.h +++ b/include/domain_decomposition.h @@ -42,21 +42,35 @@ namespace quda for (auto i = 0u; i < QUDA_MAX_DIM; i++) blockDim[i] = dd.blockDim[i]; } - // sets the given flag to true - constexpr inline DDParam &operator&=(const DD &flag) - { - flags[(int)flag] = true; - return *this; - } - // returns false if in use constexpr inline bool operator!() const { return not flags[(int)DD::in_use]; } // returns value of given flag - constexpr inline bool operator&&(const DD &flag) const { return flags[(int)flag]; } + constexpr inline bool operator&&(const DD &flag) const { return flags[(int)DD::in_use] && flags[(int)flag]; } + // returns value of given flag constexpr inline bool is(const DD &flag) const { return flags[(int)flag]; } + constexpr inline void set(const DD &flag) { flags[(int)flag] = true; } + + template constexpr inline void set(const DD &flag, Args... args) + { + set(flag); + set(args...); + } + + constexpr inline void reset() + { +#pragma unroll + for (auto i = 0u; i < (int)DD::size; i++) flags[i] = 0; + } + + template constexpr inline void reset(Args... args) + { + reset(); + set(args...); + } + // Pretty print the args struct void print() { From c116ff84f0ea1ac1c60e5a4ca38d814bc5fa1192 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 19 Mar 2024 08:59:26 +0100 Subject: [PATCH 007/108] Addind test routines for domain decomposition --- tests/dslash_test.cpp | 1 + tests/dslash_test_utils.h | 40 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp index 11f8265e7f..107d2ccba1 100644 --- a/tests/dslash_test.cpp +++ b/tests/dslash_test.cpp @@ -89,6 +89,7 @@ int main(int argc, char **argv) auto app = make_app(); app->add_option("--test", dtest_type, "Test method")->transform(CLI::CheckedTransformer(dtest_type_map)); add_eofa_option_group(app); + add_dd_option_group(app); add_comms_option_group(app); try { diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index 3d13d11478..d1871248ef 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -62,6 +62,7 @@ struct DslashTestWrapper { // CUDA color spinor fields ColorSpinorField cudaSpinor; ColorSpinorField cudaSpinorOut; + ColorSpinorField cudaSpinorTmp; // Dirac pointers quda::Dirac *dirac = nullptr; @@ -81,6 +82,7 @@ struct DslashTestWrapper { QudaParity parity = QUDA_EVEN_PARITY; static inline dslash_test_type dtest_type = dslash_test_type::Dslash; static inline bool test_split_grid = false; + static inline bool test_domain_decomposition = false; int num_src = 1; const bool transfer = false; @@ -144,6 +146,8 @@ struct DslashTestWrapper { if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match"); + test_domain_decomposition = dd_red_black; + for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i]; num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3]; test_split_grid = num_src > 1; @@ -291,6 +295,7 @@ struct DslashTestWrapper { cudaSpinor = ColorSpinorField(csParam); printfQuda("Creating cudaSpinorOut with nParity = %d\n", csParam.siteSubset); cudaSpinorOut = ColorSpinorField(csParam); + if (test_domain_decomposition) { cudaSpinorTmp = ColorSpinorField(csParam); } printfQuda("Sending spinor field to GPU\n"); cudaSpinor = spinor; @@ -790,6 +795,41 @@ struct DslashTestWrapper { dslashMultiSrcQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity, hostGauge, &gauge_param); } + } else if (test_domain_decomposition) { + + if (dd_red_black) { + for (auto i = 0u; i < 4; i++) { + cudaSpinor.dd.blockDim[i] = dd_block_size[i]; + cudaSpinorTmp.dd.blockDim[i] = dd_block_size[i]; + } + + blas::zero(cudaSpinorOut); + + // We test it applying all 4 possible operators Drr, Drb, Dbr, Dbb + for (int col = 0; col < 4; col++) { + + cudaSpinor.dd.reset(col % 2 == 0 ? DD::red_active : DD::black_active); + cudaSpinorTmp.dd.reset(col / 2 == 0 ? DD::red_active : DD::black_active); + + switch (dtest_type) { + case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorTmp, cudaSpinor, parity); break; + case dslash_test_type::MatPC: + case dslash_test_type::Mat: dirac->M(cudaSpinorTmp, cudaSpinor); break; + case dslash_test_type::MatPCDagMatPC: + case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorTmp, cudaSpinor); break; + default: + errorQuda("Test type %s not support for current Dslash", get_string(dtest_type_map, dtest_type).c_str()); + } + + cudaSpinor.dd.reset(); + cudaSpinorTmp.dd.reset(); + + blas::xpy(cudaSpinorTmp, cudaSpinorOut); + } + } else { + errorQuda("Test dd type not supported"); + } + } else { for (int i = 0; i < niter; i++) { From 9ef487173f431e22f2e27c1b858821eab76b2d9c Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 19 Mar 2024 09:13:44 +0100 Subject: [PATCH 008/108] Enabling DD::in_use by default --- include/domain_decomposition.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/domain_decomposition.h b/include/domain_decomposition.h index c9b2ca039b..a8d4cc3258 100644 --- a/include/domain_decomposition.h +++ b/include/domain_decomposition.h @@ -68,7 +68,7 @@ namespace quda template constexpr inline void reset(Args... args) { reset(); - set(args...); + set(DD::in_use, args...); } // Pretty print the args struct From 5435b960d2d901f15abf7583e8d85b1b880beaa3 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 19 Mar 2024 09:13:57 +0100 Subject: [PATCH 009/108] Adding missing DD::mode --- tests/dslash_test_utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index d1871248ef..50b866fa75 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -808,8 +808,8 @@ struct DslashTestWrapper { // We test it applying all 4 possible operators Drr, Drb, Dbr, Dbb for (int col = 0; col < 4; col++) { - cudaSpinor.dd.reset(col % 2 == 0 ? DD::red_active : DD::black_active); - cudaSpinorTmp.dd.reset(col / 2 == 0 ? DD::red_active : DD::black_active); + cudaSpinor.dd.reset(DD::mode_red_black, col % 2 == 0 ? DD::red_active : DD::black_active); + cudaSpinorTmp.dd.reset(DD::mode_red_black, col / 2 == 0 ? DD::red_active : DD::black_active); switch (dtest_type) { case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorTmp, cudaSpinor, parity); break; From 66c7ae6be897b82c07d1a45e2bc14c5cc7d33229 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 19 Mar 2024 09:14:29 +0100 Subject: [PATCH 010/108] Example of usage of DD in SAP with MR solver --- include/invert_quda.h | 12 ++++++++++++ lib/inv_mr_quda.cpp | 22 +++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/invert_quda.h b/include/invert_quda.h index 8b46148c96..5dc83ce7bf 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -235,6 +235,10 @@ namespace quda { /** Whether to use additive or multiplicative Schwarz preconditioning */ QudaSchwarzType schwarz_type = QUDA_INVALID_SCHWARZ; + /** The size of a block per direction in the Schwarz procedure + (default = 0, i.e. local volume, old implementation) */ + int schwarz_block[QUDA_MAX_DIM] = {0}; + /** The type of accelerator type to use for preconditioner */ QudaAcceleratorType accelerator_type_precondition = QUDA_INVALID_ACCELERATOR; @@ -400,6 +404,14 @@ namespace quda { // for incremental eigCG: void updateRhsIndex(QudaInvertParam ¶m) { rhs_idx = param.rhs_idx; } + + constexpr inline bool do_block_schwarz() const + { + if (schwarz_type == QUDA_INVALID_SCHWARZ) return false; + for (int i = 0; i < QUDA_MAX_DIM; i++) + if (schwarz_block[i] > 0) return true; + return false; + } }; class Solver { diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index df8f0f2dbc..3e744fa2c6 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -88,12 +88,25 @@ namespace quda int k = 0; double scale = 1.0; - if ((node_parity + step) % 2 == 0 && param.schwarz_type == QUDA_MULTIPLICATIVE_SCHWARZ) { + if (!param.do_block_schwarz() && param.schwarz_type == QUDA_MULTIPLICATIVE_SCHWARZ + && (node_parity + step) % 2 == 0) { // for multiplicative Schwarz we alternate updates depending on node parity } else { commGlobalReductionPush(param.global_reduction); // use local reductions for DD solver + if (param.do_block_schwarz()) { + if (param.schwarz_type == QUDA_MULTIPLICATIVE_SCHWARZ) { + // Red or black active + Ar.dd.reset(DD::mode_red_black, step % 2 == 0 ? DD::red_active : DD::black_active); + r_sloppy.dd.reset(DD::mode_red_black, step % 2 == 0 ? DD::red_active : DD::black_active); + } else { + // Both red and black active but no hopping + Ar.dd.reset(DD::mode_red_black, DD::red_active, DD::black_active); + r_sloppy.dd.reset(DD::mode_red_black, DD::red_active, DD::black_active); + } + } + blas::zero(x_sloppy); // can get rid of this for a special first update kernel double c2 = param.global_reduction == QUDA_BOOLEAN_TRUE ? r2 : blas::norm2(r); // c2 holds the initial r2 scale = c2 > 0.0 ? sqrt(c2) : 1.0; @@ -119,6 +132,7 @@ namespace quda } else { // doing local reductions so can make it asynchronous commAsyncReductionSet(true); + // TODO: make these block aware blas::cDotProductNormA(Ar, r_sloppy); // omega*alpha is done in the kernel @@ -132,6 +146,12 @@ namespace quda blas::axpy(scale, x_sloppy, x); // Scale and sum to accumulator + if (param.do_block_schwarz()) { + // Disable domain decomposition + Ar.dd.reset(); + r_sloppy.dd.reset(); + } + commGlobalReductionPop(); // renable global reductions for outer solver } From 20032120b8dd4f56e0b4adf6975deca532cb25b0 Mon Sep 17 00:00:00 2001 From: Ferenc Pittler Date: Thu, 4 Apr 2024 13:51:28 +0200 Subject: [PATCH 011/108] implementing projection of ColorSpinorField to a domain determined by ddParam --- include/color_spinor_field.h | 10 +++++++ lib/color_spinor_field.cpp | 23 +++++++++++++++ lib/color_spinor_util.in.cu | 54 ++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 453a5e8df4..cbc41eae7e 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -149,6 +149,7 @@ namespace quda int composite_dim = 0; // e.g., number of eigenvectors in the set bool is_component = false; int component_id = 0; // eigenvector index + DDParam dd {}; /** If using CUDA native fields, this function will ensure that the @@ -438,6 +439,13 @@ namespace quda */ void copy(const ColorSpinorField &src); + /** + @brief Project the field to a domain determined by + ddParam + */ + + void projectDD(); + /** @brief Zero all elements of this field */ @@ -946,6 +954,8 @@ namespace quda void *Dst = nullptr, const void *Src = nullptr); void genericSource(ColorSpinorField &a, QudaSourceType sourceType, int x, int s, int c); + + void genericProjectDD(ColorSpinorField &a); int genericCompare(const ColorSpinorField &a, const ColorSpinorField &b, int tol); /** diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index fba73f7b62..d0f558f533 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -98,6 +98,7 @@ namespace quda nSpin = param.nSpin; nVec = param.nVec; twistFlavor = param.twistFlavor; + dd = param.dd; if (param.pc_type != QUDA_5D_PC && param.pc_type != QUDA_4D_PC) errorQuda("Unexpected pc_type %d", param.pc_type); pc_type = param.pc_type; @@ -527,6 +528,7 @@ namespace quda param.pc_type = pc_type; param.suggested_parity = suggested_parity; param.create = QUDA_NULL_FIELD_CREATE; + param.dd = dd; } void ColorSpinorField::exchange(void **ghost, void **sendbuf, int nFace) const @@ -1532,6 +1534,27 @@ namespace quda genericPrintVector(*this, parity, x_cb, rank); } + void ColorSpinorField::projectDD() + { + if (Location() == QUDA_CPU_FIELD_LOCATION) { + genericProjectDD(*this); + } else { + ColorSpinorParam param(*this); + param.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + param.location = QUDA_CPU_FIELD_LOCATION; + param.setPrecision((param.Precision() == QUDA_HALF_PRECISION || param.Precision() == QUDA_QUARTER_PRECISION) ? + QUDA_SINGLE_PRECISION : + param.Precision()); + param.create = QUDA_COPY_FIELD_CREATE; + // since CPU fields cannot be low precision, use single precision instead + if (precision < QUDA_SINGLE_PRECISION) param.setPrecision(QUDA_SINGLE_PRECISION, QUDA_INVALID_PRECISION, false); + + ColorSpinorField tmp(param); + tmp.projectDD(); + *this = tmp; + } + } + int ColorSpinorField::Compare(const ColorSpinorField &a, const ColorSpinorField &b, const int tol) { if (checkLocation(a, b) == QUDA_CUDA_FIELD_LOCATION) errorQuda("device field not implemented"); diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu index 0b9355d4d1..9dfbad1fb7 100644 --- a/lib/color_spinor_util.in.cu +++ b/lib/color_spinor_util.in.cu @@ -423,4 +423,58 @@ namespace quda { resize(v, new_size, param); } + template void projectDD(P &p, const ColorSpinorField &meta) + { + Coord<4> coord; + int X[4] = {meta.X(0), meta.X(1), meta.X(2), meta.X(3)}; + X[0] *= (p.Nparity() == 1) ? 2 : 1; // need full lattice dims + + for (int parity = 0; parity < p.Nparity(); parity++) { + for (int x_cb = 0; x_cb < p.VolumeCB(); x_cb++) { + getCoords(coord, x_cb, X, parity); + + if (meta.dd.isZero(coord, meta.getDslashConstant())) { + // printfQuda("Should be zero %d\n",x_cb); + for (int s = 0; s < p.Nspin(); s++) + for (int c = 0; c < p.Ncolor(); c++) p(parity, x_cb, s, c) = 0; + } else { + // printfQuda("Should not be zero %d\n",x_cb); + } + } + } + } + + template void genericProjectDD(ColorSpinorField &a) + { + FieldOrderCB A(a); + projectDD(A, a); + } + + template void genericProjectDD(ColorSpinorField &a) + { + + switch (a.Nspin()) { + case (1): + if constexpr (is_enabled_spin(1)) genericProjectDD(a); + break; + + case (2): + if constexpr (is_enabled_spin(2)) genericProjectDD(a); + break; + case (4): + if constexpr (is_enabled_spin(4)) genericProjectDD(a); + break; + default: errorQuda("Nspin %d not implemented", a.Nspin()); + } + } + + void genericProjectDD(ColorSpinorField &a) + { + switch (a.Precision()) { + case QUDA_DOUBLE_PRECISION: genericProjectDD(a); break; + case QUDA_SINGLE_PRECISION: genericProjectDD(a); break; + default: errorQuda("Precision %d not implemented", a.Precision()); + } + } + } // namespace quda From f76859b14f71fffb76d5bf9f521d673fb1398388 Mon Sep 17 00:00:00 2001 From: Ferenc Pittler Date: Thu, 4 Apr 2024 13:52:30 +0200 Subject: [PATCH 012/108] implementing tests for projection on domain determined by ddParam --- tests/dslash_test_utils.h | 41 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h index 50b866fa75..d26f8ca662 100644 --- a/tests/dslash_test_utils.h +++ b/tests/dslash_test_utils.h @@ -63,6 +63,8 @@ struct DslashTestWrapper { ColorSpinorField cudaSpinor; ColorSpinorField cudaSpinorOut; ColorSpinorField cudaSpinorTmp; + ColorSpinorField cudaSpinorTmp2; + ColorSpinorField cudaSpinorTmp3; // Dirac pointers quda::Dirac *dirac = nullptr; @@ -295,7 +297,11 @@ struct DslashTestWrapper { cudaSpinor = ColorSpinorField(csParam); printfQuda("Creating cudaSpinorOut with nParity = %d\n", csParam.siteSubset); cudaSpinorOut = ColorSpinorField(csParam); - if (test_domain_decomposition) { cudaSpinorTmp = ColorSpinorField(csParam); } + if (test_domain_decomposition) { + cudaSpinorTmp = ColorSpinorField(csParam); + cudaSpinorTmp2 = ColorSpinorField(csParam); + cudaSpinorTmp3 = ColorSpinorField(csParam); + } printfQuda("Sending spinor field to GPU\n"); cudaSpinor = spinor; @@ -801,6 +807,8 @@ struct DslashTestWrapper { for (auto i = 0u; i < 4; i++) { cudaSpinor.dd.blockDim[i] = dd_block_size[i]; cudaSpinorTmp.dd.blockDim[i] = dd_block_size[i]; + cudaSpinorTmp2.dd.blockDim[i] = dd_block_size[i]; + cudaSpinorTmp3.dd.blockDim[i] = dd_block_size[i]; } blas::zero(cudaSpinorOut); @@ -824,6 +832,33 @@ struct DslashTestWrapper { cudaSpinor.dd.reset(); cudaSpinorTmp.dd.reset(); + cudaSpinorTmp2 = spinor; + cudaSpinorTmp2.dd.reset(DD::mode_red_black, col % 2 == 0 ? DD::red_active : DD::black_active); + cudaSpinorTmp2.projectDD(); + cudaSpinorTmp2.dd.reset(); + + switch (dtest_type) { + case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorTmp3, cudaSpinorTmp2, parity); break; + case dslash_test_type::MatPC: + case dslash_test_type::Mat: dirac->M(cudaSpinorTmp3, cudaSpinorTmp2); break; + case dslash_test_type::MatPCDagMatPC: + case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorTmp3, cudaSpinorTmp2); break; + default: + errorQuda("Test type %s not support for current Dslash", get_string(dtest_type_map, dtest_type).c_str()); + } + + cudaSpinorTmp3.dd.reset(DD::mode_red_black, col / 2 == 0 ? DD::red_active : DD::black_active); + cudaSpinorTmp3.projectDD(); + cudaSpinorTmp3.dd.reset(); + + spinorTmp = cudaSpinorTmp; + spinorOut = cudaSpinorTmp3; + + double deviation = std::pow(10, -(double)(ColorSpinorField::Compare(spinorTmp, spinorOut))); + printfQuda("Deviaton source %d sink %d %e\n", col % 2, col / 2, deviation); + + // here test cudaSpinorTmp==cudaSpinorTmp2 + blas::xpy(cudaSpinorTmp, cudaSpinorOut); } } else { @@ -1079,6 +1114,10 @@ struct DslashTestWrapper { size_t ghost_bytes = cudaSpinor.GhostBytes(); + printfQuda("dslash time event time %e %e %e\n", dslash_time.event_time, dslash_time.cpu_time, dslash_time.cpu_max); + printfQuda("niter %d\n", niter); + printfQuda("ghost_bytes %e\n", ghost_bytes); + printfQuda("Effective halo bi-directional bandwidth (GB/s) GPU = %f ( CPU = %f, min = %f , max = %f ) for " "aggregate message size %lu bytes\n", 1.0e-9 * 2 * ghost_bytes * niter / dslash_time.event_time, From 99fa6434770eadf67b2f20027cca16282c9d9b92 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 7 May 2024 09:10:30 +0200 Subject: [PATCH 013/108] Adding flags QUDA_DIRAC_DD and GPU_DD_DIRAC --- CMakeLists.txt | 2 ++ include/quda_define.h.in | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 03ea9f4fba..8d9ce6c032 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,6 +151,8 @@ option(QUDA_DIRAC_CLOVER_HASENBUSCH "build clover Hasenbusch twist operators" ${ option(QUDA_DIRAC_NDEG_TWISTED_MASS "build non-degenerate twisted mass Dirac operators" ${QUDA_DIRAC_DEFAULT}) option(QUDA_DIRAC_NDEG_TWISTED_CLOVER "build non-degenerate twisted clover Dirac operators" ${QUDA_DIRAC_DEFAULT}) +option(QUDA_DIRAC_DD "build code for domain decomposition of the Dirac operator" OFF) + option(QUDA_CONTRACT "build code for bilinear contraction" OFF) option(QUDA_COVDEV "build code for covariant derivative" OFF) diff --git a/include/quda_define.h.in b/include/quda_define.h.in index 445d3cd544..2692833f38 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -128,6 +128,15 @@ #define GPU_LAPLACE #endif +#cmakedefine QUDA_DIRAC_DD +#ifdef QUDA_DIRAC_DD +/** + * @def GPU_DD_DIRAC + * @brief This macro is set when we have DD-aware Dirac operator enabled + */ +#define GPU_DD_DIRAC +#endif + #cmakedefine QUDA_COVDEV #ifdef QUDA_COVDEV /** From d4c448ad08873c9a5e3dafafdef0a6b1748b7f23 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 7 May 2024 12:26:57 +0200 Subject: [PATCH 014/108] Adding instantiation over DDArg --- include/dslash_helper.cuh | 8 ++++---- include/enum_quda.h | 2 ++ include/instantiate_dslash.h | 24 +++++++++++++++++++++++- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index bcd7bc6dce..ee3846e344 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -242,7 +242,7 @@ namespace quda return true; } - template struct DslashArg { + template struct DslashArg { using Float = Float_; using real = typename mapper::type; @@ -287,9 +287,9 @@ namespace quda int exterior_dims; // dimension to run in the exterior Dslash int exterior_blocks; - DDParam dd_out; - DDParam dd_in; - DDParam dd_x; + DDArg dd_out; + DDArg dd_in; + DDArg dd_x; // for shmem ... static constexpr bool packkernel = false; diff --git a/include/enum_quda.h b/include/enum_quda.h index 60c186eb77..3548b8e8a3 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -590,6 +590,8 @@ typedef enum QudaExtLibType_s { QUDA_EXTLIB_INVALID = QUDA_INVALID_ENUM } QudaExtLibType; +typedef enum QudaDDType_s { QUDA_DD_NO, QUDA_DD_RED_BLACK, QUDA_DD_INVALID = QUDA_INVALID_ENUM } QudaDDType; + #ifdef __cplusplus } #endif diff --git a/include/instantiate_dslash.h b/include/instantiate_dslash.h index d9c8144de4..1c96e2e7ea 100644 --- a/include/instantiate_dslash.h +++ b/include/instantiate_dslash.h @@ -17,7 +17,7 @@ namespace quda @param[in] args Additional arguments for different dslash kernels */ template