diff --git a/CMakeLists.txt b/CMakeLists.txt
index abcf27de99..4043970062 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -151,8 +151,8 @@ option(QUDA_DIRAC_CLOVER_HASENBUSCH "build clover Hasenbusch twist operators" ${
 option(QUDA_DIRAC_NDEG_TWISTED_MASS "build non-degenerate twisted mass Dirac operators" ${QUDA_DIRAC_DEFAULT})
 option(QUDA_DIRAC_NDEG_TWISTED_CLOVER "build non-degenerate twisted clover Dirac operators" ${QUDA_DIRAC_DEFAULT})
 option(QUDA_DIRAC_LAPLACE "build laplace operator" ${QUDA_DIRAC_DEFAULT})
-
 option(QUDA_DIRAC_DISTANCE_PRECONDITIONING "build code for distance preconditioned Wilson/clover Dirac operators" OFF)
+set(QUDA_DOMAIN_DECOMPOSITION "0" CACHE STRING "which domain decomposition to instantiate in QUDA (1-bit number - RedBlack)")
 
 option(QUDA_COVDEV "build code for covariant derivative" OFF)
 
diff --git a/ci/docker/Dockerfile.build b/ci/docker/Dockerfile.build
index 5ca1d7a3dc..3019b28c38 100644
--- a/ci/docker/Dockerfile.build
+++ b/ci/docker/Dockerfile.build
@@ -39,6 +39,7 @@ RUN  QUDA_TEST_GRID_SIZE="1 1 1 2" cmake -S /quda/src \
     -DQUDA_MULTIGRID_NVEC_LIST=6 \
     -DQUDA_MDW_FUSED_LS_LIST=4 \
     -DQUDA_MPI=ON  \
+    -DQUDA_DSLASH_DISTANCE=1 \
     -DQUDA_DIRAC_DEFAULT_OFF=ON \
     -DQUDA_DIRAC_WILSON=ON \
     -DQUDA_DIRAC_CLOVER=ON \
diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h
index 9edc8fbdef..6ce0282049 100644
--- a/include/color_spinor_field.h
+++ b/include/color_spinor_field.h
@@ -150,6 +150,7 @@ namespace quda
     int composite_dim = 0; // e.g., number of eigenvectors in the set
     bool is_component = false;
     int component_id = 0; // eigenvector index
+    DDParam dd {};
 
     /**
        If using CUDA native fields, this function will ensure that the
@@ -367,6 +368,9 @@ namespace quda
     //
     CompositeColorSpinorField components;
 
+    /** Domain decomposition options */
+    DDParam dd {};
+
     /**
        Compute the required extended ghost zone sizes and offsets
        @param[in] nFace The depth of the halo
@@ -449,6 +453,31 @@ namespace quda
      */
     void copy(const ColorSpinorField &src);
 
+    /**
+       @brief Project the field to a domain determined by DDParam
+     */
+    void projectDD();
+
+    /**
+       @brief Returns a read-only reference to the DDParam stored in this field
+     */
+    const DDParam& DD() const { return dd; }
+
+    /**
+       @brief Returns a mutable reference to the DDParam stored in this field
+     */
+    DDParam& DD() { return dd; }
+
+    /**
+       @brief Replaces the DDParam stored in this field with a copy of the given one
+     */
+    void DD(const DDParam &in) { dd = in; }
+
+    /**
+       @brief Sets one or more DD flags on the stored DDParam (all arguments are forwarded to DDParam::set)
+     */
+    template <typename... Args> void DD(const quda::DD &flag, const Args &...args) { dd.set(flag, args...); }
+
     /**
        @brief Zero all elements of this field
      */
@@ -993,6 +1022,8 @@ namespace quda
                               void *Dst = nullptr, const void *Src = nullptr);
 
   void genericSource(ColorSpinorField &a, QudaSourceType sourceType, int x, int s, int c);
+
+  void genericProjectDD(ColorSpinorField &a);
   int genericCompare(const ColorSpinorField &a, const ColorSpinorField &b, int tol);
 
   /**
diff --git a/include/declare_enum.h b/include/declare_enum.h
new file mode 100644
index 0000000000..69b721b343
--- /dev/null
+++ b/include/declare_enum.h
@@ -0,0 +1,60 @@
+/*
+ * A macro that declares an `enum class` as well as a `to_string` function for the enums.
+ * The enum has also a default value `size` that measures the size of the enum.
+ *
+ * Credit: https://stackoverflow.com/a/71375077/12084612
+ * -------
+ * License: CC BY-SA 4.0
+ * --------
+ * Usage:
+ * ------
+ *
+ * DECLARE_ENUM(WeekEnum, Mon, Tue, Wed, Thu, Fri, Sat, Sun,);
+ *
+ * int main()
+ * {
+ *     WeekEnum weekDay = WeekEnum::Wed;
+ *     std::cout << to_string(weekDay) << std::endl; // prints Wed
+ *     std::cout << to_string(WeekEnum::Sat) << std::endl; // prints Sat
+ *     std::cout << static_cast<int>(WeekEnum::size) << std::endl; // prints 7
+ *     return 0;
+ * }
+ *
+ */
+
+#pragma once
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <unordered_map>
+
+// Returns the enumerator names parsed from `en_str` (comma-separated); parsed once per `en_key`, cache unsynchronized.
+static inline const std::vector<std::string> &get_enum_names(const std::string &en_key, const std::string &en_str)
+{
+  static std::unordered_map<std::string, std::vector<std::string>> en_names_map;
+  const auto it = en_names_map.find(en_key);
+  if (it != en_names_map.end()) return it->second; // references to map elements stay valid across later insertions
+
+  constexpr auto delim(',');
+  std::vector<std::string> en_names;
+  std::size_t start {};
+  auto end = en_str.find(delim);
+  while (end != std::string::npos) {
+    while (en_str[start] == ' ') ++start; // skip the blank that follows each comma in the stringified list
+    en_names.push_back(en_str.substr(start, end - start));
+    start = end + 1;
+    end = en_str.find(delim, start);
+  }
+  while (en_str[start] == ' ') ++start;
+  en_names.push_back(en_str.substr(start)); // text after the last comma; empty when the list ends with a comma
+  return en_names_map.emplace(en_key, std::move(en_names)).first->second;
+}
+
+#define DECLARE_ENUM(ENUM_NAME, ...)                                                                                   \
+  enum class ENUM_NAME : unsigned int { __VA_ARGS__ size };                                                            \
+  inline std::string to_string(ENUM_NAME en)                                                                           \
+  {                                                                                                                    \
+    const auto &names = get_enum_names(#ENUM_NAME, #__VA_ARGS__);                                                      \
+    return static_cast<std::size_t>(en) < names.size() ? names[static_cast<std::size_t>(en)] : std::string {};         \
+  }
diff --git a/include/domain_decomposition.h b/include/domain_decomposition.h
new file mode 100644
index 0000000000..24e653ac37
--- /dev/null
+++ b/include/domain_decomposition.h
@@ -0,0 +1,153 @@
+#pragma once
+
+#include "declare_enum.h"
+
+namespace quda
+{
+
+  // using namespace quda;
+
+  DECLARE_ENUM(DD, // name of the enum class
+
+               reset, // No domain decomposition. It sets all flags to zero.
+
+               red_black_type,   // first red-black flag: it and every flag after it select QUDA_DD_RED_BLACK
+               red_active,       // if red blocks are active
+               black_active,     // if black blocks are active
+               no_block_hopping, // if set, hopping between red and black blocks is disabled
+  );
+
+  // Parameters for domain decomposition
+  struct DDParam {
+
+    QudaDDType type = QUDA_DD_NO;
+    array<bool, static_cast<int>(DD::size)> flags = {}; // the default value of all flags is 0
+    array<int, QUDA_MAX_DIM> block_dim = {};            // the size of the block per direction
+
+    // Default constructor
+    DDParam() = default;
+
+    // returns true if no domain decomposition is in use (type == QUDA_DD_NO), false if in use
+    constexpr bool operator!() const { return type == QUDA_DD_NO; }
+
+    // returns value of given flag
+    constexpr bool is(const DD &flag) const { return flags[static_cast<int>(flag)]; }
+
+    // sets given flag to true; DD::reset instead clears everything, any red-black flag selects QUDA_DD_RED_BLACK
+    constexpr void set(const DD &flag)
+    {
+      flags[static_cast<int>(flag)] = true;
+
+      if (flag == DD::reset) {
+#pragma unroll
+        for (int i = 0; i < static_cast<int>(DD::size); i++) flags[i] = false; // also clears the reset flag itself
+        type = QUDA_DD_NO;
+      } else if (static_cast<int>(flag) >= static_cast<int>(DD::red_black_type)) {
+        type = QUDA_DD_RED_BLACK;
+      }
+    }
+
+    template <typename... Args> constexpr void set(const DD &flag, const Args &...args)
+    {
+      set(flag);    // apply the first flag
+      set(args...); // then the remaining ones, in the order given
+    }
+
+    // Pretty print the args struct
+    void print() const
+    {
+      if (not *this) {
+        printfQuda("DD not in use\n");
+        return;
+      }
+      printfQuda("Printing DDParam\n");
+      for (int i = 0; i < (int)DD::size; i++)
+        printfQuda("flags[DD::%s] = %s\n", to_string((DD)i).c_str(), flags[i] ? "true" : "false");
+      for (int i = 0; i < QUDA_MAX_DIM; i++) printfQuda("block_dim[%d] = %d\n", i, static_cast<int>(block_dim[i]));
+    }
+
+    // Checks that block_dim is compatible with the global lattice of the given field (always true if not in use)
+    template <typename F> inline bool check(const F &field, bool verbose = false) const
+    {
+      if (not *this) return true;
+
+      if (type == QUDA_DD_RED_BLACK) {
+        for (int i = 0; i < field.Ndim(); i++) {
+          if (block_dim[i] < 0) {
+            if (verbose) printfQuda("block_dim[%d] = %d is negative\n", i, block_dim[i]);
+            return false;
+          }
+          if (block_dim[i] > 0) { // block_dim[i] == 0 is accepted without further checks
+            int globalDim = comm_dim(i) * field.full_dim(i);
+            if (globalDim % block_dim[i] != 0) {
+              if (verbose) printfQuda("block_dim[%d] = %d does not divide %d \n", i, block_dim[i], globalDim);
+              return false;
+            }
+            if ((globalDim / block_dim[i]) % 2 != 0) { // the number of blocks per direction must be even
+              if (verbose)
+                printfQuda("block_dim[%d] = %d does not divide %d **evenly** \n", i, block_dim[i], globalDim);
+              return false;
+            }
+          }
+        }
+        if (block_dim[0] % 2) {
+          if (verbose) printfQuda("block_dim[0] = %d must be even \n", block_dim[0]);
+          return false;
+        }
+      }
+
+      return true;
+    }
+
+    // Checks if this is compatible with the given DDParam (type, block_dim and no_block_hopping must agree)
+    inline bool match(const DDParam &dd, bool verbose = false) const
+    {
+      // if one of the two is not in use we return true, i.e. one of the two is a full field
+      if (not *this or not dd) return true;
+
+      // false if type does not match
+      if (type != dd.type) {
+        if (verbose) printfQuda("DD type do not match (%d != %d)\n", type, dd.type);
+        return false;
+      }
+
+      if (type == QUDA_DD_RED_BLACK) {
+        for (int i = 0; i < QUDA_MAX_DIM; i++)
+          if (block_dim[i] != dd.block_dim[i]) {
+            if (verbose) printfQuda("block_dim[%d] = %d != %d \n", i, block_dim[i], dd.block_dim[i]);
+            return false;
+          }
+        if (is(DD::no_block_hopping) != dd.is(DD::no_block_hopping)) {
+          if (verbose) printfQuda("no_block_hopping do not match.\n");
+          return false;
+        }
+      }
+
+      return true;
+    }
+
+    // Checks if this is equal to given DDParam
+    inline bool operator==(const DDParam &dd) const
+    {
+      // if both are not in use we return true
+      if (not *this and not dd) return true;
+
+      // false if type does not match
+      if (type != dd.type) return false;
+
+      // checking all flags matches (note this should be actually type-wise)
+      for (int i = 0; i < (int)DD::size; i++)
+        if (flags[i] != dd.flags[i]) return false;
+
+      // checking block_dim matches when needed
+      if (type == QUDA_DD_RED_BLACK)
+        for (int i = 0; i < QUDA_MAX_DIM; i++)
+          if (block_dim[i] != dd.block_dim[i]) return false;
+
+      return true;
+    }
+
+    inline bool operator!=(const DDParam &dd) const { return !(*this == dd); }
+  };
+
+} // namespace quda
diff --git a/include/domain_decomposition_helper.cuh b/include/domain_decomposition_helper.cuh
new file mode 100644
index 0000000000..2b7ab5acea
--- /dev/null
+++ b/include/domain_decomposition_helper.cuh
@@ -0,0 +1,112 @@
+#pragma once
+
+#include <fast_intdiv.h>
+#include <color_spinor_field.h>
+
+namespace quda
+{
+
+  // No domain decomposition: nothing is zeroed and every hop is kept (also the template for the DD interface)
+  struct DDNo {
+
+    // Only validates the input: any type other than QUDA_DD_NO is reported through errorQuda
+    DDNo(const DDParam &dd)
+    {
+      if (dd.type != QUDA_DD_NO) { errorQuda("Unsupported type %d\n", dd.type); }
+    }
+
+    // Only DDNo returns true. All others return false
+    constexpr bool operator!() const { return true; }
+
+    // Whether comms are required along given direction (always true here, arguments are ignored)
+    template <typename DDArg, typename Arg> constexpr bool commDim(int, const DDArg &, const Arg &) const
+    {
+      return true;
+    }
+
+    // Whether field at given coord is zero (never, without domain decomposition)
+    template <typename Coord> constexpr bool isZero(const Coord &) const { return false; }
+
+    // Whether do hopping with field at neighboring coord (always, without domain decomposition)
+    template <typename Coord> constexpr bool doHopping(const Coord &, int, int) const { return true; }
+  };
+
+  // Red-black block DD: sites are coloured by the parity of the summed block indices of their global coordinates
+  struct DDRedBlack {
+
+    const int_fastdiv block_dim[QUDA_MAX_DIM]; // the size of the block per direction
+    const bool red_active;         // if red blocks are active
+    const bool black_active;       // if black blocks are active
+    const bool block_hopping;      // if hopping between red and black is allowed
+
+    DDRedBlack(const DDParam &dd) : // a QUDA_DD_NO input acts as "both colours active, hopping allowed"
+      block_dim {dd.block_dim[0], dd.block_dim[1], dd.block_dim[2], dd.block_dim[3]}, // first 4 directions only
+      red_active(dd.type == QUDA_DD_NO or dd.is(DD::red_active)),
+      black_active(dd.type == QUDA_DD_NO or dd.is(DD::black_active)),
+      block_hopping(dd.type == QUDA_DD_NO or not dd.is(DD::no_block_hopping))
+    {
+      if (dd.type != QUDA_DD_NO and dd.type != QUDA_DD_RED_BLACK) { errorQuda("Unsupported type %d", dd.type); }
+    }
+
+    constexpr bool operator!() const { return false; }
+
+    // Whether comms are required along given direction
+    template <typename DDArg, typename Arg> constexpr bool commDim(int d, const DDArg &dd, const Arg &arg) const
+    {
+      if (not red_active and not black_active) return false;
+      if (not dd.red_active and not dd.black_active) return false;
+      if (block_dim[d] > 0 and arg.dim[d] % block_dim[d] == 0) { // 0 means no blocking along d (avoid x % 0)
+        if (not red_active and not dd.red_active) return false;
+        if (not black_active and not dd.black_active) return false;
+        if (not block_hopping and not dd.block_hopping) return false;
+      }
+      return true;
+    }
+
+    // Computes block_parity: 0 = red, 1 = black
+    template <typename Coord> constexpr bool block_parity(const Coord &x) const
+    {
+      int block_sum = 0; // sum of the block indices over all directions that are actually blocked
+      for (int i = 0; i < x.size(); i++)
+        if (block_dim[i] > 0) block_sum += x.gx[i] / block_dim[i];
+      return block_sum % 2 == 1;
+    }
+
+    // Whether the site reached from x by moving dist along mu (with periodic wrap) lies in a different block
+    template <typename Coord> constexpr bool on_border(const Coord &x, int mu, int dist) const
+    {
+      if (block_dim[mu] == 0) return false;
+      int x_mu = x.gx[mu] + dist;
+      if (x_mu < 0) x_mu += x.gDim[mu];
+      if (x_mu >= x.gDim[mu]) x_mu -= x.gDim[mu];
+      return x.gx[mu] / block_dim[mu] != x_mu / block_dim[mu];
+    }
+
+    template <typename Coord> constexpr bool isZero(const Coord &x) const
+    {
+      bool is_black = block_parity(x);
+      bool is_red = not is_black;
+      // a site is zero unless the colour of the block it belongs to is active
+      if (is_red and red_active) return false;
+      if (is_black and black_active) return false;
+      return true;
+    }
+
+    template <typename Coord> constexpr bool doHopping(const Coord &x, int mu, int dist) const
+    {
+      bool is_black = block_parity(x);
+      bool is_red = !is_black;
+      bool is_border = on_border(x, mu, dist);
+      // inside a block the site's own colour must be active; across a border the opposite colour must be
+      if (!is_border) { // Within block
+        if (is_red and red_active) return true;
+        if (is_black and black_active) return true;
+      } else if (block_hopping) { // Between blocks
+        if (is_red and black_active) return true;
+        if (is_black and red_active) return true;
+      }
+      return false;
+    }
+  };
+
+} // namespace quda
diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh
index 6b4747f39e..4990abc333 100644
--- a/include/dslash_helper.cuh
+++ b/include/dslash_helper.cuh
@@ -11,6 +11,7 @@
 #include <shmem_pack_helper.cuh>
 #include <kernel_helper.h>
 #include <tune_quda.h>
+#include <domain_decomposition_helper.cuh>
 
 constexpr quda::use_kernel_arg_p use_kernel_arg = quda::use_kernel_arg_p::TRUE;
 
@@ -99,6 +100,7 @@ namespace quda
   {
     constexpr auto nDim = Arg::nDim;
     Coord<nDim> coord;
+    for (auto i = 0; i < nDim; i++) coord.gDim[i] = arg.gDim[i];
     dim = kernel_type; // keep compiler happy
 
     // only for 5-d checkerboarding where we need to include the fifth dimension
@@ -150,6 +152,7 @@ namespace quda
         coordsFromFaceIndex<nDim, pc_type, 3, nface_>(coord.X, coord.x_cb, coord, idx, face_num, parity, arg);
       }
     }
+    for (int i = 0; i < nDim; i++) { coord.gx[i] = arg.commCoord[i] + coord.x[i]; }
     coord.s = s;
     return coord;
   }
@@ -236,7 +239,7 @@ namespace quda
     return true;
   }
 
-  template <typename Float_, int nDim_, int n_src_tile_ = 1> struct DslashArg {
+  template <typename Float_, int nDim_, typename DDArg, int n_src_tile_ = 1> struct DslashArg {
 
     using Float = Float_;
     using real = typename mapper<Float>::type;
@@ -250,9 +253,13 @@ namespace quda
 
     const int_fastdiv X0h;
     const int_fastdiv dim[5]; // full lattice dimensions
+    const int gDim[5];        // global full lattice dimensions
     const int volumeCB;       // checkerboarded volume
     int commDim[4];           // whether a given dimension is partitioned or not (potentially overridden for Schwarz)
 
+    const int commCoord[5];
+    const int globalDim3;
+
     const bool dagger; // dagger
     const bool xpay;   // whether we are doing xpay or not
 
@@ -283,6 +290,10 @@ namespace quda
     int exterior_dims; // dimension to run in the exterior Dslash
     int exterior_blocks;
 
+    DDArg dd_out;
+    DDArg dd_in;
+    DDArg dd_x;
+
     // for shmem ...
     static constexpr bool packkernel = false;
     void *packBuffer[4 * QUDA_MAX_DIM];
@@ -316,7 +327,10 @@ namespace quda
       reconstruct(U.Reconstruct()),
       X0h(nParity == 2 ? in.X(0) / 2 : in.X(0)),
       dim {(3 - nParity) * in.X(0), in.X(1), in.X(2), in.X(3), in.Ndim() == 5 ? in.X(4) : 1},
+      gDim {comm_dim(0) * dim[0], comm_dim(1) * dim[1], comm_dim(2) * dim[2], comm_dim(3) * dim[3], dim[4]},
       volumeCB(in.VolumeCB()),
+      commCoord {comm_coord(0) * dim[0], comm_coord(1) * dim[1], comm_coord(2) * dim[2], comm_coord(3) * dim[3], dim[4]},
+      globalDim3(comm_dim(3) * this->dim[3]),
       dagger(dagger),
       xpay(xpay),
       kernel_type(INTERIOR_KERNEL),
@@ -336,6 +350,9 @@ namespace quda
       pack_blocks(0),
       exterior_dims(0),
       exterior_blocks(0),
+      dd_out(out.DD()),
+      dd_in(in.DD()),
+      dd_x(x.DD()),
 #ifndef NVSHMEM_COMMS
       counter(0)
 #else
@@ -354,10 +371,11 @@ namespace quda
         if (in[i].data() == out[i].data()) errorQuda("Aliasing pointers");
       checkOrder(out, in, x);        // check all orders match
       checkLocation(out, in, x, U);  // check all locations match
+      checkDD(out, in, x);           // check all DD match
       checkNative(in, U);
 
       for (int d = 0; d < 4; d++) {
-        commDim[d] = (comm_override[d] == 0) ? 0 : comm_dim_partitioned(d);
+        commDim[d] = (comm_override[d] == 0) ? 0 : (comm_dim_partitioned(d) * dd_out.commDim(d, dd_in, *this));
       }
 
       if (in.Location() == QUDA_CUDA_FIELD_LOCATION) {
@@ -413,7 +431,8 @@ namespace quda
     }
   };
 
-  template <typename Float, int nDim> std::ostream &operator<<(std::ostream &out, const DslashArg<Float, nDim> &arg)
+  template <typename Float, int nDim, typename DDArg>
+  std::ostream &operator<<(std::ostream &out, const DslashArg<Float, nDim, DDArg> &arg)
   {
     out << "parity = " << arg.parity << std::endl;
     out << "nParity = " << arg.nParity << std::endl;
diff --git a/include/enum_quda.h b/include/enum_quda.h
index 462ce98cf0..ed0cb1897b 100644
--- a/include/enum_quda.h
+++ b/include/enum_quda.h
@@ -372,16 +372,18 @@ typedef enum QudaFieldCreate_s {
   QUDA_INVALID_FIELD_CREATE = QUDA_INVALID_ENUM
 } QudaFieldCreate;
 
-typedef enum QudaGammaBasis_s {          // gamj=((top 2 rows)(bottom 2 rows))  s1,s2,s3 are Pauli spin matrices, 1 is 2x2 identity
-  QUDA_DEGRAND_ROSSI_GAMMA_BASIS,   // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((0,1)(1,0))  gam5=((-1,0)(0,1))
-  QUDA_UKQCD_GAMMA_BASIS,           // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,i*s2)(-i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,-1)(-1,0))
-  QUDA_CHIRAL_GAMMA_BASIS,          // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((0,-1)(-1,0))gam5=((1,0)(0,-1))
-  QUDA_DIRAC_PAULI_GAMMA_BASIS,     // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,1)(1,0))
-  QUDA_INVALID_GAMMA_BASIS = QUDA_INVALID_ENUM      //  gam5=gam4*gam1*gam2*gam3
+typedef enum QudaGammaBasis_s { // gamj=((top 2 rows)(bottom 2 rows))  s1,s2,s3 are Pauli spin matrices, 1 is 2x2 identity
+  QUDA_DEGRAND_ROSSI_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,i*s3)(-i*s3,0))
+                                  // gam4=((0,1)(1,0))  gam5=((-1,0)(0,1))
+  QUDA_UKQCD_GAMMA_BASIS, // gam1=((0,i*s1)(-i*s1,0)) gam2=((0,i*s2)(-i*s2,0)) gam3=((0,i*s3)(-i*s3,0)) gam4=((1,0)(0,-1)) gam5=((0,-1)(-1,0))
+  QUDA_CHIRAL_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0)) gam4=((0,-1)(-1,0))gam5=((1,0)(0,-1))
+  QUDA_DIRAC_PAULI_GAMMA_BASIS, // gam1=((0,-i*s1)(i*s1,0)) gam2=((0,-i*s2)(i*s2,0)) gam3=((0,-i*s3)(i*s3,0))
+                                // gam4=((1,0)(0,-1)) gam5=((0,1)(1,0))
+  QUDA_INVALID_GAMMA_BASIS = QUDA_INVALID_ENUM //  gam5=gam4*gam1*gam2*gam3
 } QudaGammaBasis;
-                                      //  Dirac-Pauli -> DeGrand-Rossi   T = i/sqrt(2)*((s2,-s2)(s2,s2))     field_DR = T * field_DP
-                                      //  UKQCD -> DeGrand-Rossi         T = i/sqrt(2)*((-s2,-s2)(-s2,s2))   field_DR = T * field_UK
-                                      //  Chiral -> DeGrand-Rossi        T = i*((0,-s2)(s2,0))               field_DR = T * field_chiral
+//  Dirac-Pauli -> DeGrand-Rossi   T = i/sqrt(2)*((s2,-s2)(s2,s2))     field_DR = T * field_DP
+//  UKQCD -> DeGrand-Rossi         T = i/sqrt(2)*((-s2,-s2)(-s2,s2))   field_DR = T * field_UK
+//  Chiral -> DeGrand-Rossi        T = i*((0,-s2)(s2,0))               field_DR = T * field_chiral
 typedef enum QudaSourceType_s {
   QUDA_POINT_SOURCE,
   QUDA_RANDOM_SOURCE,
@@ -636,6 +638,8 @@ typedef enum QudaExtLibType_s {
   QUDA_EXTLIB_INVALID = QUDA_INVALID_ENUM
 } QudaExtLibType;
 
+typedef enum QudaDDType_s { QUDA_DD_NO, QUDA_DD_RED_BLACK, QUDA_DD_INVALID = QUDA_INVALID_ENUM } QudaDDType;
+
 typedef enum QudaWFlowStepType_s {
   WFLOW_STEP_W1,
   WFLOW_STEP_W2,
diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h
index 8874959d7b..faf68bf914 100644
--- a/include/enum_quda_fortran.h
+++ b/include/enum_quda_fortran.h
@@ -544,3 +544,8 @@
 #define QUDA_CUSOLVE_EXTLIB 0
 #define QUDA_EIGEN_EXTLIB 1
 #define QUDA_EXTLIB_INVALID QUDA_INVALID_ENUM
+
+#define QudaDDType integer(4)
+#define QUDA_DD_NO 0
+#define QUDA_DD_RED_BLACK 1
+#define QUDA_DD_INVALID QUDA_INVALID_ENUM
diff --git a/include/index_helper.cuh b/include/index_helper.cuh
index cf1faf72b0..2f8a4a3d91 100644
--- a/include/index_helper.cuh
+++ b/include/index_helper.cuh
@@ -230,12 +230,15 @@ namespace quda {
 
   template <int nDim>
   struct Coord {
-    int x[nDim]; // nDim lattice coordinates
+    array<int, nDim> x = {};    // nDim lattice coordinates (the values returned by operator[])
+    array<int, nDim> gx = {};   // nDim global lattice coordinates
+    array<int, nDim> gDim = {}; // global lattice dimensions
     int x_cb;    // checkerboard lattice site index
     int s;       // fifth dimension coord
     int X;       // full lattice site index
     constexpr const int& operator[](int i) const { return x[i]; }
     constexpr int& operator[](int i) { return x[i]; }
+    constexpr int size() const { return nDim; } // number of entries in x, gx and gDim
   };
 
   /**
diff --git a/include/instantiate.h b/include/instantiate.h
index 8eee6ad269..9b40ede127 100644
--- a/include/instantiate.h
+++ b/include/instantiate.h
@@ -84,6 +84,24 @@ namespace quda
     }
   }
 
+  /**
+     @brief Maps a QudaPrecision to a data type through the member alias `type`; the primary template has no `type`.
+  */
+  template <QudaPrecision precision> struct precision_type_mapper {
+  };
+  template <> struct precision_type_mapper<QUDA_DOUBLE_PRECISION> {
+    using type = double;
+  };
+  template <> struct precision_type_mapper<QUDA_SINGLE_PRECISION> {
+    using type = float;
+  };
+  template <> struct precision_type_mapper<QUDA_HALF_PRECISION> {
+    using type = short; // half precision maps to a 16-bit integer type
+  };
+  template <> struct precision_type_mapper<QUDA_QUARTER_PRECISION> {
+    using type = int8_t; // quarter precision maps to an 8-bit integer type
+  };
+
   /**
      @brief Helper function for returning if a given reconstruct is enabled
      @tparam reconstruct The reconstruct requested
@@ -97,6 +115,20 @@ namespace quda
   template <> constexpr bool is_enabled<QUDA_RECONSTRUCT_8>() { return (QUDA_RECONSTRUCT & 1) ? true : false; }
   template <> constexpr bool is_enabled<QUDA_RECONSTRUCT_10>() { return true; }
 
+  /**
+     @brief Tells whether the requested domain decomposition type was enabled at build time
+     @param[in] DD The domain decomposition requested
+     @return True if enabled, false if not
+  */
+  constexpr bool is_enabled(QudaDDType DD)
+  {
+    // running without domain decomposition is always possible
+    if (DD == QUDA_DD_NO) return true;
+    // red-black is available only when bit 0 of QUDA_DOMAIN_DECOMPOSITION is set
+    if (DD == QUDA_DD_RED_BLACK) return (QUDA_DOMAIN_DECOMPOSITION & 1) != 0;
+    return false; // any other value is not supported
+  }
+
   struct ReconstructFull {
     static constexpr std::array<QudaReconstructType, 6> recon
       = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_9, QUDA_RECONSTRUCT_8, QUDA_RECONSTRUCT_10};
diff --git a/include/instantiate_dslash.h b/include/instantiate_dslash.h
index eab0ead243..587f36e7ee 100644
--- a/include/instantiate_dslash.h
+++ b/include/instantiate_dslash.h
@@ -5,6 +5,7 @@
 #include <color_spinor_field.h>
 #include <gauge_field.h>
 #include <instantiate.h>
+#include <domain_decomposition_helper.cuh>
 
 namespace quda
 {
@@ -16,24 +17,24 @@ namespace quda
      @param[in] U Gauge field
      @param[in] args Additional arguments for different dslash kernels
   */
-  template <template <typename, int, QudaReconstructType> class Apply, typename Recon, typename Float, int nColor,
-            typename... Args>
+  template <template <typename, int, typename, QudaReconstructType> class Apply, typename Recon, typename Float,
+            int nColor, typename DDArg, typename... Args>
   void instantiate(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                    cvector_ref<const ColorSpinorField> &x, const GaugeField &U, Args &&...args)
   {
     if (U.Reconstruct() == Recon::recon[0]) {
       if constexpr (is_enabled<QUDA_RECONSTRUCT_NO>())
-        Apply<Float, nColor, Recon::recon[0]>(out, in, x, U, args...);
+        Apply<Float, nColor, DDArg, Recon::recon[0]>(out, in, x, U, args...);
       else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-18", QUDA_RECONSTRUCT);
     } else if (U.Reconstruct() == Recon::recon[1]) {
       if constexpr (is_enabled<QUDA_RECONSTRUCT_12>())
-        Apply<Float, nColor, Recon::recon[1]>(out, in, x, U, args...);
+        Apply<Float, nColor, DDArg, Recon::recon[1]>(out, in, x, U, args...);
       else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-12/13", QUDA_RECONSTRUCT);
     } else if (U.Reconstruct() == Recon::recon[2]) {
       if constexpr (is_enabled<QUDA_RECONSTRUCT_8>())
-        Apply<Float, nColor, Recon::recon[2]>(out, in, x, U, args...);
+        Apply<Float, nColor, DDArg, Recon::recon[2]>(out, in, x, U, args...);
       else
         errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8/9", QUDA_RECONSTRUCT);
     } else {
@@ -41,6 +42,30 @@ namespace quda
     }
   }
 
+  /**
+     @brief This instantiate function is used to instantiate the domain decomposition type (nColor is forwarded)
+     @param[out] out Output result field
+     @param[in] in Input field
+     @param[in] U Gauge field
+     @param[in] args Additional arguments for different dslash kernels
+  */
+  template <template <typename, int, typename, QudaReconstructType> class Apply, typename Recon, typename Float,
+            int nColor, typename... Args>
+  inline void instantiate(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                          cvector_ref<const ColorSpinorField> &x, const GaugeField &U, Args &&...args)
+  {
+    if (out.DD().type == QUDA_DD_NO and in.DD().type == QUDA_DD_NO) {
+      instantiate<Apply, Recon, Float, nColor, DDNo>(out, in, x, U, args...);
+    } else if (out.DD().type == QUDA_DD_RED_BLACK or in.DD().type == QUDA_DD_RED_BLACK) {
+      if constexpr (is_enabled(QUDA_DD_RED_BLACK)) // red-black also covers the case where only one side uses it
+        instantiate<Apply, Recon, Float, nColor, DDRedBlack>(out, in, x, U, args...);
+      else
+        errorQuda("QUDA_DOMAIN_DECOMPOSITION=%d does not enable RedBlack", QUDA_DOMAIN_DECOMPOSITION);
+    } else {
+      errorQuda("Unsupported DD type %d\n", out.DD().type);
+    }
+  }
+
   /**
      @brief This instantiate function is used to instantiate the colors
      @param[out] out Output result field
@@ -48,7 +73,7 @@ namespace quda
      @param[in] U Gauge field
      @param[in] args Additional arguments for different dslash kernels
   */
-  template <template <typename, int, QudaReconstructType> class Apply, typename Recon, typename Float, typename... Args>
+  template <template <typename, int, typename, QudaReconstructType> class Apply, typename Recon, typename Float, typename... Args>
   void instantiate(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                    cvector_ref<const ColorSpinorField> &x, const GaugeField &U, Args &&...args)
   {
@@ -66,7 +91,8 @@ namespace quda
      @param[in] U Gauge field
      @param[in] args Additional arguments for different dslash kernels
   */
-  template <template <typename, int, QudaReconstructType> class Apply, typename Recon = ReconstructWilson, typename... Args>
+  template <template <typename, int, typename, QudaReconstructType> class Apply, typename Recon = ReconstructWilson,
+            typename... Args>
   void instantiate(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                    cvector_ref<const ColorSpinorField> &x, const GaugeField &U, Args &&...args)
   {
@@ -104,7 +130,8 @@ namespace quda
      @param[in] U Gauge field
      @param[in] args Additional arguments for different dslash kernels
   */
-  template <template <typename, int, QudaReconstructType> class Apply, typename Recon = ReconstructWilson, typename... Args>
+  template <template <typename, int, typename, QudaReconstructType> class Apply, typename Recon = ReconstructWilson,
+            typename... Args>
   void instantiatePreconditioner(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                                  cvector_ref<const ColorSpinorField> &x, const GaugeField &U, Args &&...args)
   {
diff --git a/include/invert_quda.h b/include/invert_quda.h
index d9d5279066..a650766ece 100644
--- a/include/invert_quda.h
+++ b/include/invert_quda.h
@@ -222,6 +222,10 @@ namespace quda {
     /** Whether to use additive or multiplicative Schwarz preconditioning */
     QudaSchwarzType schwarz_type = QUDA_INVALID_SCHWARZ;
 
+    /** The size of a block per direction in the Schwarz procedure
+        (default = 0, i.e. local volume, old implementation) */
+    int schwarz_block[QUDA_MAX_DIM] = {0};
+
     /** The type of accelerator type to use for preconditioner */
     QudaAcceleratorType accelerator_type_precondition = QUDA_INVALID_ACCELERATOR;
 
@@ -364,6 +368,14 @@ namespace quda {
 
     // for incremental eigCG:
     void updateRhsIndex(QudaInvertParam &param) { rhs_idx = param.rhs_idx; }
+
+    inline bool do_block_schwarz() const
+    {
+      // block Schwarz needs a valid Schwarz type and a positive block size in at least one direction
+      bool has_block = false;
+      for (int d = 0; d < QUDA_MAX_DIM; d++) has_block = has_block || (schwarz_block[d] > 0);
+      return schwarz_type != QUDA_INVALID_SCHWARZ && has_block;
+    }
   };
 
   class Solver {
diff --git a/include/kernels/color_spinor_project_dd.cuh b/include/kernels/color_spinor_project_dd.cuh
new file mode 100644
index 0000000000..064c0b90cb
--- /dev/null
+++ b/include/kernels/color_spinor_project_dd.cuh
@@ -0,0 +1,55 @@
+#include <color_spinor_field_order.h>
+#include <dslash_helper.cuh>
+#include <kernel.h>
+
+namespace quda
+{
+
+  using namespace colorspinor;
+
+  /** @brief Kernel argument carrying the accessor, DD descriptor and geometry that ProjectDD_ reads */
+  template <typename Float, typename DDArg, int nSpin_, int nColor_, typename Order>
+  struct ProjectDDArg : kernel_param<> {
+    static constexpr int nColor = nColor_;
+    static constexpr int nSpin = nSpin_;
+    using real = typename mapper<Float>::type;
+
+    Order out;                // accessor constructed from the field handed to the constructor
+    const DDArg dd;           // constructed from out.DD()
+    const int parity;         // 1 when the field uses QUDA_ODD_EVEN_SITE_ORDER, otherwise 0
+    const int nParity;        // number of parities we're working on (out.SiteSubset())
+    const int_fastdiv X0h;    // X(0) / 2 when nParity == 2, X(0) otherwise
+    const int_fastdiv dim[5]; // full_dim(0..3); fifth entry is full_dim(4) for Ndim() == 5, else 1
+    const int commCoord[5];   // comm_coord(d) * dim[d] for each d
+
+    // NOTE: members are initialized in declaration order; X0h reads nParity and commCoord reads dim
+    ProjectDDArg(ColorSpinorField &out) :
+      kernel_param(dim3(out.VolumeCB(), out.SiteSubset(), 1)),
+      out(out),
+      dd(out.DD()),
+      parity(out.SiteOrder() != QUDA_ODD_EVEN_SITE_ORDER ? 0 : 1),
+      nParity(out.SiteSubset()),
+      X0h(nParity != 2 ? out.X(0) : out.X(0) / 2),
+      dim {out.full_dim(0), out.full_dim(1), out.full_dim(2), out.full_dim(3), out.Ndim() != 5 ? 1 : out.full_dim(4)},
+      commCoord {comm_coord(0) * dim[0], comm_coord(1) * dim[1], comm_coord(2) * dim[2], comm_coord(3) * dim[3],
+                 comm_coord(4) * dim[4]} { }
+  };
+
+  /** @brief Functor that overwrites with a default-constructed ColorSpinor every site where arg.dd.isZero() holds */
+  template <typename Arg> struct ProjectDD_ {
+    const Arg &arg;
+    constexpr ProjectDD_(const Arg &arg) : arg(arg) { }
+    static constexpr const char *filename() { return KERNEL_FILE; }
+
+    __device__ __host__ inline void operator()(int x_cb, int parity)
+    {
+      Coord<4> site;
+      site.X = getCoordsCB(site, x_cb, arg.dim, arg.X0h, parity);
+      for (int d = 0; d < site.size(); d++) site.gx[d] = site.x[d] + arg.commCoord[d];
+      if (!arg.dd.isZero(site)) return; // this site is left as it is
+      ColorSpinor<typename Arg::real, Arg::nColor, Arg::nSpin> zero;
+      arg.out(x_cb, (parity + arg.parity) & 1) = zero;
+    }
+  };
+
+} // namespace quda
diff --git a/include/kernels/covariant_derivative.cuh b/include/kernels/covariant_derivative.cuh
index 8a9a18f30a..f58e1b1ff7 100644
--- a/include/kernels/covariant_derivative.cuh
+++ b/include/kernels/covariant_derivative.cuh
@@ -14,8 +14,8 @@ namespace quda
   /**
      @brief Parameter structure for driving the covariant derivative operator
   */
-  template <typename Float, int nSpin_, int nColor_, QudaReconstructType reconstruct_, int nDim>
-  struct CovDevArg : DslashArg<Float, nDim> {
+  template <typename Float, int nSpin_, int nColor_, typename DDArg, QudaReconstructType reconstruct_, int nDim>
+  struct CovDevArg : DslashArg<Float, nDim, DDArg> {
     static constexpr int nColor = nColor_;
     static constexpr int nSpin = nSpin_;
     static constexpr bool spin_project = false;
@@ -36,12 +36,12 @@ namespace quda
     F in[MAX_MULTI_RHS];   /** input vector field */
     const Ghost halo_pack; /** accessor for writing the halo field */
     const Ghost halo;      /** accessor for reading the halo field */
-    const G U;  /** the gauge field */
-    int mu;     /** The direction in which to apply the derivative */
+    const G U;             /** the gauge field */
+    int mu;                /** The direction in which to apply the derivative */
 
     CovDevArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo,
               const GaugeField &U, int mu, int parity, bool dagger, const int *comm_override) :
-      DslashArg<Float, nDim>(out, in, halo, U, in, parity, dagger, false, 1, spin_project, comm_override),
+      DslashArg<Float, nDim, DDArg>(out, in, halo, U, in, parity, dagger, false, 1, spin_project, comm_override),
       halo_pack(halo),
       halo(halo),
       U(U),
@@ -77,7 +77,8 @@ namespace quda
 
     const int d = mu % 4;
 
-    if (mu < 4) { // Forward gather - compute fwd offset for vector fetch
+    if (mu < 4 and arg.dd_in.doHopping(coord, d, +1)) {
+      // Forward gather - compute fwd offset for vector fetch
 
       const int fwd_idx = getNeighborIndexCB(coord, d, +1, arg.dc);
       const bool ghost = (coord[d] + 1 >= arg.dim[d]) && isActive<kernel_type>(active, thread_dim, d, coord, arg);
@@ -96,7 +97,8 @@ namespace quda
         out += U * in;
       }
 
-    } else { // Backward gather - compute back offset for spinor and gauge fetch
+    } else if (mu >= 4 and arg.dd_in.doHopping(coord, d, -1)) {
+      // Backward gather - compute back offset for spinor and gauge fetch
 
       const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc);
       const int gauge_idx = back_idx;
@@ -124,7 +126,7 @@ namespace quda
   template <int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg> struct covDev : dslash_default {
 
     const Arg &arg;
-    constexpr covDev(const Arg &arg) : arg(arg) {}
+    constexpr covDev(const Arg &arg) : arg(arg) { }
     static constexpr const char *filename() { return KERNEL_FILE; } // this file name - used for run-time compilation
 
     template <KernelType mykernel_type = kernel_type>
@@ -144,6 +146,11 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
       Vector out;
 
+      if (arg.dd_x.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
+
       switch (arg.mu) { // ensure that mu is known to compiler for indexing in applyCovDev (avoid register spillage)
       case 0:
         applyCovDev<nParity, dagger, mykernel_type, 0>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
diff --git a/include/kernels/dslash_domain_wall_4d.cuh b/include/kernels/dslash_domain_wall_4d.cuh
index 4fbb511230..b04d488e01 100644
--- a/include/kernels/dslash_domain_wall_4d.cuh
+++ b/include/kernels/dslash_domain_wall_4d.cuh
@@ -5,8 +5,8 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_>
-  struct DomainWall4DArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct DomainWall4DArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
     typedef typename mapper<Float>::type real;
     int_fastdiv Ls;                     /** fifth dimension length */
     complex<real> a_5[QUDA_MAX_DWF_LS]; /** xpay scale factor for each 4-d subvolume */
@@ -15,7 +15,8 @@ namespace quda
                     const ColorSpinorField &halo, const GaugeField &U, double a, double m_5, const Complex *b_5,
                     const Complex *c_5, bool xpay, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                     const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, xpay ? a : 0.0, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, xpay ? a : 0.0, x, parity, dagger,
+                                                          comm_override),
       Ls(in.X(4))
     {
       if (b_5 == nullptr || c_5 == nullptr)
@@ -48,10 +49,18 @@ namespace quda
 
       const int my_spinor_parity = nParity == 2 ? parity : 0;
       Vector out;
-      applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
       int xs = coord.x_cb + s * arg.dc.volume_4d_cb;
-      if (xpay && mykernel_type == INTERIOR_KERNEL) {
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](xs, my_spinor_parity) = out;
+        return;
+      }
+
+      applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
+
+      if (xpay && mykernel_type == INTERIOR_KERNEL && arg.dd_x.isZero(coord)) {
+        out = arg.a_5[s] * out;
+      } else if (xpay && mykernel_type == INTERIOR_KERNEL) {
         Vector x = arg.x[src_idx](xs, my_spinor_parity);
         out = x + arg.a_5[s] * out;
       } else if (mykernel_type != INTERIOR_KERNEL && active) {
diff --git a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh
index 06ef52324b..9631c667ad 100644
--- a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh
+++ b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh
@@ -7,15 +7,15 @@
 namespace quda
 {
 
-  template <typename Float, int nColor_, int nDim, QudaReconstructType reconstruct_, Dslash5Type dslash5_type_>
-  struct DomainWall4DFusedM5Arg : DomainWall4DArg<Float, nColor_, nDim, reconstruct_>,
+  template <typename Float, int nColor_, int nDim, typename DDArg, QudaReconstructType reconstruct_, Dslash5Type dslash5_type_>
+  struct DomainWall4DFusedM5Arg : DomainWall4DArg<Float, nColor_, nDim, DDArg, reconstruct_>,
                                   Dslash5Arg<Float, nColor_, false, false, dslash5_type_> {
     // ^^^ Note that for Dslash5Arg we have xpay == dagger == false. This is because the xpay and dagger are determined
     // by fused kernel, not the dslash5, so the `false, false` here are simply dummy instantiations.
 
     static constexpr int nColor = nColor_;
 
-    using DomainWall4DArg = DomainWall4DArg<Float, nColor, nDim, reconstruct_>;
+    using DomainWall4DArg = DomainWall4DArg<Float, nColor, nDim, DDArg, reconstruct_>;
     using DomainWall4DArg::a_5;
     using DomainWall4DArg::dagger;
     using DomainWall4DArg::in;
diff --git a/include/kernels/dslash_domain_wall_5d.cuh b/include/kernels/dslash_domain_wall_5d.cuh
index 3737bacb5c..27a5ce4eff 100644
--- a/include/kernels/dslash_domain_wall_5d.cuh
+++ b/include/kernels/dslash_domain_wall_5d.cuh
@@ -7,8 +7,8 @@ namespace quda
 
   // fixme: fused kernel (thread dim mappers set after construction?) and xpay
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_>
-  struct DomainWall5DArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct DomainWall5DArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
     typedef typename mapper<Float>::type real;
     int_fastdiv Ls; /** fifth dimension length */
     real a;   /** xpay scale factor */
@@ -17,14 +17,15 @@ namespace quda
     DomainWall5DArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                     const ColorSpinorField &halo, const GaugeField &U, double a, double m_f, bool xpay,
                     cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, xpay ? a : 0.0, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, xpay ? a : 0.0, x, parity, dagger,
+                                                          comm_override),
       Ls(in.X(4)),
       a(a),
       m_f(m_f)
     {
       // remove the batch dimension from these constants, since these are used for 5-d checkerboard indexing
-      DslashArg<Float, nDim>::dc.X[4] = in.X(4);
-      DslashArg<Float, nDim>::dc.X5X4X3X2X1mX4X3X2X1 = (in.X(4) - 1) * DslashArg<Float, nDim>::dc.X4X3X2X1;
+      DslashArg<Float, nDim, DDArg>::dc.X[4] = in.X(4);
+      DslashArg<Float, nDim, DDArg>::dc.X5X4X3X2X1mX4X3X2X1 = (in.X(4) - 1) * DslashArg<Float, nDim, DDArg>::dc.X4X3X2X1;
     }
   };
 
@@ -56,13 +57,18 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
       Vector out;
 
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
+
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
       if (mykernel_type == INTERIOR_KERNEL) { // 5th dimension derivative always local
         constexpr int d = 4;
         const int s = coord[4];
         const int their_spinor_parity = nParity == 2 ? 1 - parity : 0;
-        {
+        if (arg.dd_in.doHopping(coord, d, +1)) {
           const int fwd_idx = getNeighborIndexCB(coord, d, +1, arg.dc);
           constexpr int proj_dir = dagger ? +1 : -1;
           Vector in = arg.in[src_idx](fwd_idx, their_spinor_parity);
@@ -73,7 +79,7 @@ namespace quda
           }
         }
 
-        {
+        if (arg.dd_in.doHopping(coord, d, -1)) {
           const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc);
           constexpr int proj_dir = dagger ? -1 : +1;
           Vector in = arg.in[src_idx](back_idx, their_spinor_parity);
@@ -85,7 +91,9 @@ namespace quda
         }
       }
 
-      if (xpay && mykernel_type == INTERIOR_KERNEL) {
+      if (xpay && mykernel_type == INTERIOR_KERNEL and arg.dd_x.isZero(coord)) {
+        out = arg.a * out;
+      } else if (xpay && mykernel_type == INTERIOR_KERNEL) {
         Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
         out = x + arg.a * out;
       } else if (mykernel_type != INTERIOR_KERNEL && active) {
diff --git a/include/kernels/dslash_ndeg_twisted_clover.cuh b/include/kernels/dslash_ndeg_twisted_clover.cuh
index fdc4c5cb99..b5961ade89 100644
--- a/include/kernels/dslash_ndeg_twisted_clover.cuh
+++ b/include/kernels/dslash_ndeg_twisted_clover.cuh
@@ -6,11 +6,11 @@
 
 namespace quda
 {
-  
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_>
-    struct NdegTwistedCloverArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
-    
-    using WilsonArg<Float, nColor, nDim, reconstruct_>::nSpin;
+
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct NdegTwistedCloverArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
+
+    using WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>::nSpin;
     static constexpr int length = (nSpin / (nSpin / 2)) * 2 * nColor * nColor * (nSpin / 2) * (nSpin / 2) / 2;
     typedef typename clover_mapper<Float, length, true>::type C;
     typedef typename mapper<Float>::type real;
@@ -24,7 +24,7 @@ namespace quda
                          const ColorSpinorField &halo, const GaugeField &U, const CloverField &A, double a, double b,
                          double c, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                          const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, a, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, a, x, parity, dagger, comm_override),
       A(A, false),
       a(a),
       // if dagger flip the chiral twist
@@ -34,9 +34,9 @@ namespace quda
     {
       checkPrecision(U, A);
       checkLocation(U, A);
-      }
+    }
   };
-  
+
   template <int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
     struct nDegTwistedClover : dslash_default {
     
@@ -68,11 +68,17 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
       const int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb;
       Vector out;
-      
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out;
+        return;
+      }
+
       // defined in dslash_wilson.cuh
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
-      if (mykernel_type == INTERIOR_KERNEL) {
+      if (mykernel_type == INTERIOR_KERNEL && arg.dd_x.isZero(coord)) {
+        out = arg.a * out;
+      } else if (mykernel_type == INTERIOR_KERNEL) {
         // apply the chiral and flavor twists
         // use consistent load order across s to ensure better cache locality
         Vector x = arg.x[src_idx](my_flavor_idx, my_spinor_parity);
diff --git a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh
index 01bcfd7088..70d5d6635d 100644
--- a/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh
+++ b/include/kernels/dslash_ndeg_twisted_clover_preconditioned.cuh
@@ -7,10 +7,10 @@
 
 namespace quda
 {
-  
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_>
-    struct NdegTwistedCloverPreconditionedArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
-    using WilsonArg<Float, nColor, nDim, reconstruct_>::nSpin;
+
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct NdegTwistedCloverPreconditionedArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
+    using WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>::nSpin;
     static constexpr int length = (nSpin / (nSpin / 2)) * 2 * nColor * nColor * (nSpin / 2) * (nSpin / 2) / 2;
     static constexpr bool dynamic_clover = clover::dynamic_inverse();
     
@@ -27,7 +27,8 @@ namespace quda
                                        const ColorSpinorField &halo, const GaugeField &U, const CloverField &A,
                                        double a, double b, double c, bool xpay, cvector_ref<const ColorSpinorField> &x,
                                        int parity, bool dagger, const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, xpay ? 1.0 : 0.0, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, xpay ? 1.0 : 0.0, x, parity, dagger,
+                                                          comm_override),
       A(A, false),
       A2inv(A, dynamic_clover ? false : true), // if dynamic clover we don't want the inverse field
       a(a),
@@ -37,7 +38,7 @@ namespace quda
     {
       checkPrecision(U, A);
       checkLocation(U, A);
-      }
+    }
   };
 
   template <int nParity, bool dagger, bool xpay, KernelType kernel_type, typename Arg>
@@ -70,13 +71,16 @@ namespace quda
       auto coord = getCoords<QUDA_4D_PC, mykernel_type>(arg, idx, flavor, parity, thread_dim);
 
       const int my_spinor_parity = nParity == 2 ? parity : 0;
+      int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb;
       Vector out;
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out;
+        return;
+      }
 
       // defined in dslash_wilson.cuh
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
-      int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb;
-
       if (mykernel_type != INTERIOR_KERNEL && active) {
         // if we're not the interior kernel, then we must sum the partial
         Vector x = arg.out[src_idx](my_flavor_idx, my_spinor_parity);
@@ -142,7 +146,7 @@ namespace quda
         Vector tmp = out_chi[0].chiral_reconstruct(0) + out_chi[1].chiral_reconstruct(1);
         tmp.toNonRel(); // switch back to non-chiral basis
 
-        if (xpay) {
+        if (xpay and not arg.dd_x.isZero(coord)) {
           Vector x = arg.x[src_idx](my_flavor_idx, my_spinor_parity);
           out = x + arg.a * tmp;
         } else {
diff --git a/include/kernels/dslash_ndeg_twisted_mass.cuh b/include/kernels/dslash_ndeg_twisted_mass.cuh
index e4df183846..1a91e59643 100644
--- a/include/kernels/dslash_ndeg_twisted_mass.cuh
+++ b/include/kernels/dslash_ndeg_twisted_mass.cuh
@@ -5,8 +5,8 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_>
-  struct NdegTwistedMassArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct NdegTwistedMassArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
     typedef typename mapper<Float>::type real;
     real a; /** this is the Wilson-dslash scale factor */
     real b; /** this is the chiral twist factor */
@@ -15,7 +15,7 @@ namespace quda
     NdegTwistedMassArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                        const ColorSpinorField &halo, const GaugeField &U, double a, double b, double c,
                        cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, a, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, a, x, parity, dagger, comm_override),
       a(a),
       b(dagger ? -b : b), // if dagger flip the chiral twist
       c(c)
@@ -50,14 +50,19 @@ namespace quda
       auto coord = getCoords<QUDA_4D_PC, mykernel_type>(arg, idx, flavor, parity, thread_dim);
 
       const int my_spinor_parity = nParity == 2 ? parity : 0;
+      int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb;
       Vector out;
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out;
+        return;
+      }
 
       // defined in dslash_wilson.cuh
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
-      int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb;
-
-      if (mykernel_type == INTERIOR_KERNEL) {
+      if (mykernel_type == INTERIOR_KERNEL && arg.dd_x.isZero(coord)) {
+        out = arg.a * out;
+      } else if (mykernel_type == INTERIOR_KERNEL) {
         // apply the chiral and flavor twists
         // use consistent load order across s to ensure better cache locality
         Vector x0 = arg.x[src_idx](coord.x_cb + 0 * arg.dc.volume_4d_cb, my_spinor_parity);
diff --git a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh
index 4bdb432039..e830aeaaaa 100644
--- a/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh
+++ b/include/kernels/dslash_ndeg_twisted_mass_preconditioned.cuh
@@ -7,8 +7,8 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_, bool asymmetric_>
-  struct NdegTwistedMassArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_, bool asymmetric_>
+  struct NdegTwistedMassArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
     typedef typename mapper<Float>::type real;
     static constexpr bool asymmetric = asymmetric_; /** whether we are applying the asymetric operator or not */
     real a;          /** this is the Wilson-dslash scale factor */
@@ -21,7 +21,8 @@ namespace quda
     NdegTwistedMassArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                        const ColorSpinorField &halo, const GaugeField &U, double a, double b, double c, bool xpay,
                        cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, xpay ? 1.0 : 0.0, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, xpay ? 1.0 : 0.0, x, parity, dagger,
+                                                          comm_override),
       a(a),
       b(dagger ? -b : b), // if dagger flip the chiral twist
       c(c),
@@ -31,9 +32,9 @@ namespace quda
     {
       // set parameters for twisting in the packing kernel
       if (dagger && !asymmetric) {
-        DslashArg<Float, nDim>::twist_a = this->a;
-        DslashArg<Float, nDim>::twist_b = this->b;
-        DslashArg<Float, nDim>::twist_c = this->c;
+        DslashArg<Float, nDim, DDArg>::twist_a = this->a;
+        DslashArg<Float, nDim, DDArg>::twist_b = this->b;
+        DslashArg<Float, nDim, DDArg>::twist_c = this->c;
       }
     }
   };
@@ -68,16 +69,19 @@ namespace quda
       auto coord = getCoords<QUDA_4D_PC, mykernel_type>(arg, idx, flavor, parity, thread_dim);
 
       const int my_spinor_parity = nParity == 2 ? parity : 0;
+      int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb;
       Vector out;
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](my_flavor_idx, my_spinor_parity) = out;
+        return;
+      }
 
       if (!dagger || Arg::asymmetric) // defined in dslash_wilson.cuh
         applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
       else // defined in dslash_twisted_mass_preconditioned
         applyWilsonTM<nParity, dagger, 2, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
-      int my_flavor_idx = coord.x_cb + flavor * arg.dc.volume_4d_cb;
-
-      if (xpay && mykernel_type == INTERIOR_KERNEL) {
+      if (xpay && mykernel_type == INTERIOR_KERNEL && not arg.dd_x.isZero(coord)) {
 
         if (!dagger || Arg::asymmetric) { // apply inverse twist which is undone below
           // use consistent load order across s to ensure better cache locality
@@ -97,7 +101,7 @@ namespace quda
         Vector x = arg.out[src_idx](my_flavor_idx, my_spinor_parity);
         out += x;
       }
-      
+
       if (!dagger || Arg::asymmetric) { // apply A^{-1} to D*in
         SharedMemoryCache<Vector> cache;
         if (isComplete<mykernel_type>(arg, coord) && active) {
diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh
index ec09ab1622..d531a8a00d 100644
--- a/include/kernels/dslash_staggered.cuh
+++ b/include/kernels/dslash_staggered.cuh
@@ -14,9 +14,9 @@ namespace quda
   /**
      @brief Parameter structure for driving the Staggered Dslash operator
   */
-  template <typename Float, int nColor_, int nDim, QudaReconstructType reconstruct_u_, QudaReconstructType reconstruct_l_,
-            bool improved_, QudaStaggeredPhase phase_ = QUDA_STAGGERED_PHASE_MILC, int n_src_tile = MAX_MULTI_RHS_TILE>
-  struct StaggeredArg : DslashArg<Float, nDim, n_src_tile> {
+  template <typename Float, int nColor_, int nDim, typename DDArg, QudaReconstructType reconstruct_u_,
+            QudaReconstructType reconstruct_l_, bool improved_, QudaStaggeredPhase phase_ = QUDA_STAGGERED_PHASE_MILC, int n_src_tile = MAX_MULTI_RHS_TILE>
+  struct StaggeredArg : DslashArg<Float, nDim, DDArg, n_src_tile> {
     typedef typename mapper<Float>::type real;
     static constexpr int nColor = nColor_;
     static constexpr int nSpin = 1;
@@ -56,8 +56,8 @@ namespace quda
     StaggeredArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                  const ColorSpinorField &halo, const GaugeField &U, const GaugeField &L, double a,
                  cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override) :
-      DslashArg<Float, nDim, n_src_tile>(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true,
-                                         improved_ ? 3 : 1, spin_project, comm_override),
+      DslashArg<Float, nDim, DDArg, n_src_tile>(out, in, halo, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1,
+                                    spin_project, comm_override),
       halo_pack(halo, improved_ ? 3 : 1),
       halo(halo, improved_ ? 3 : 1),
       U(U),
@@ -98,7 +98,7 @@ namespace quda
     for (int d = 0; d < 4; d++) { // loop over dimension
 
       // standard - forward direction
-      {
+      if (arg.dd_in.doHopping(coord, d, +1)) {
         const bool ghost = (coord[d] + 1 >= arg.dim[d]) && isActive<kernel_type>(active, thread_dim, d, coord, arg);
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, 1);
@@ -121,7 +121,7 @@ namespace quda
       }
 
       // improved - forward direction
-      if (arg.improved) {
+      if (arg.improved and arg.dd_in.doHopping(coord, d, +3)) {
         const bool ghost = (coord[d] + 3 >= arg.dim[d]) && isActive<kernel_type>(active, thread_dim, d, coord, arg);
         if (doHalo<kernel_type>(d) && ghost) {
           const int ghost_idx = ghostFaceIndexStaggered<1>(coord, arg.dim, d, arg.nFace);
@@ -143,7 +143,7 @@ namespace quda
         }
       }
 
-      {
+      if (arg.dd_in.doHopping(coord, d, -1)) {
         // Backward gather - compute back offset for spinor and gauge fetch
         const bool ghost = (coord[d] - 1 < 0) && isActive<kernel_type>(active, thread_dim, d, coord, arg);
 
@@ -172,7 +172,7 @@ namespace quda
       }
 
       // improved - backward direction
-      if (arg.improved) {
+      if (arg.improved and arg.dd_in.doHopping(coord, d, -3)) {
         const bool ghost = (coord[d] - 3 < 0) && isActive<kernel_type>(active, thread_dim, d, coord, arg);
         if (doHalo<kernel_type>(d) && ghost) {
           // when updating replace arg.nFace with 1 here
@@ -221,12 +221,18 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
 
       array<Vector, n_src_tile> out;
+      if (arg.dd_out.isZero(coord)) { // output site projected out: store `out` as declared and skip the stencil
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) for (auto s = 0; s < n_src_tile; s++) arg.out[src_idx + s](coord.x_cb, my_spinor_parity) = out[s];
+        return;
+      }
       applyStaggered<nParity, mykernel_type, n_src_tile>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
 #pragma unroll
       for (auto s = 0; s < n_src_tile; s++) out[s] *= arg.dagger_scale;
 
-      if (xpay && mykernel_type == INTERIOR_KERNEL) {
+      if (xpay && mykernel_type == INTERIOR_KERNEL && arg.dd_x.isZero(coord)) {
+        for (auto s = 0; s < n_src_tile; s++) out[s] = -out[s]; // dd_x zero site: per-tile form of `out = -out`
+      } else if (xpay && mykernel_type == INTERIOR_KERNEL) {
 #pragma unroll
         for (auto s = 0; s < n_src_tile; s++) {
           Vector x = arg.x[src_idx + s](coord.x_cb, my_spinor_parity);
diff --git a/include/kernels/dslash_twisted_clover_preconditioned.cuh b/include/kernels/dslash_twisted_clover_preconditioned.cuh
index 99db81ee60..e3e86ee84a 100644
--- a/include/kernels/dslash_twisted_clover_preconditioned.cuh
+++ b/include/kernels/dslash_twisted_clover_preconditioned.cuh
@@ -7,9 +7,9 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_>
-  struct TwistedCloverArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
-    using WilsonArg<Float, nColor, nDim, reconstruct_>::nSpin;
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct TwistedCloverArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
+    using WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>::nSpin;
     static constexpr int length = (nSpin / (nSpin / 2)) * 2 * nColor * nColor * (nSpin / 2) * (nSpin / 2) / 2;
     static constexpr bool dynamic_clover = clover::dynamic_inverse();
 
@@ -25,7 +25,8 @@ namespace quda
                      const ColorSpinorField &halo, const GaugeField &U, const CloverField &A, double a, double b,
                      bool xpay, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                      const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, xpay ? 1.0 : 0.0, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, xpay ? 1.0 : 0.0, x, parity, dagger,
+                                                          comm_override),
       A(A, false),
       A2inv(A, dynamic_clover ? false : true), // if dynamic clover we don't want the inverse field
       a(a),
@@ -66,6 +67,10 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
 
       Vector out;
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
 
       // defined in dslash_wilson.cuh
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
@@ -103,7 +108,7 @@ namespace quda
 
         tmp.toNonRel(); // switch back to non-chiral basis
 
-        if (xpay) {
+        if (xpay && not arg.dd_x.isZero(coord)) {
           Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
           out = x + arg.a * tmp;
         } else {
diff --git a/include/kernels/dslash_twisted_mass.cuh b/include/kernels/dslash_twisted_mass.cuh
index bed5cf9369..b202fabb01 100644
--- a/include/kernels/dslash_twisted_mass.cuh
+++ b/include/kernels/dslash_twisted_mass.cuh
@@ -5,8 +5,8 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_>
-  struct TwistedMassArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct TwistedMassArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
     typedef typename mapper<Float>::type real;
     real a; /** xpay scale facotor */
     real b; /** this is the twist factor */
@@ -14,7 +14,7 @@ namespace quda
     TwistedMassArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                    const ColorSpinorField &halo, const GaugeField &U, double a, double b,
                    cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, a, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, a, x, parity, dagger, comm_override),
       a(a),
       b(dagger ? -b : b) // if dagger flip the twist
     {
@@ -47,10 +47,17 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
       Vector out;
 
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
+
       // defined in dslash_wilson.cuh
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
-      if (mykernel_type == INTERIOR_KERNEL) {
+      if (mykernel_type == INTERIOR_KERNEL and arg.dd_x.isZero(coord)) {
+        out = arg.a * out;
+      } else if (mykernel_type == INTERIOR_KERNEL) {
         Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
         x += arg.b * x.igamma(4);
         out = x + arg.a * out;
diff --git a/include/kernels/dslash_twisted_mass_preconditioned.cuh b/include/kernels/dslash_twisted_mass_preconditioned.cuh
index 3fc7e9d42d..6577848d2d 100644
--- a/include/kernels/dslash_twisted_mass_preconditioned.cuh
+++ b/include/kernels/dslash_twisted_mass_preconditioned.cuh
@@ -5,8 +5,8 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_, bool asymmetric_>
-  struct TwistedMassArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_, bool asymmetric_>
+  struct TwistedMassArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
     typedef typename mapper<Float>::type real;
     static constexpr bool asymmetric = asymmetric_; /** whether we are applying the asymmetric operator or not */
     real a;          /** this is the scaling factor */
@@ -18,7 +18,8 @@ namespace quda
     TwistedMassArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                    const ColorSpinorField &halo, const GaugeField &U, double a, double b, bool xpay,
                    cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, xpay ? 1.0 : 0.0, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, xpay ? 1.0 : 0.0, x, parity, dagger,
+                                                          comm_override),
       a(a),
       b(dagger ? -b : b), // if dagger flip the twist
       c(0.0),
@@ -27,8 +28,8 @@ namespace quda
     {
       // set parameters for twisting in the packing kernel
       if (dagger && !asymmetric) {
-        DslashArg<Float, nDim>::twist_a = this->a;
-        DslashArg<Float, nDim>::twist_b = this->b;
+        DslashArg<Float, nDim, DDArg>::twist_a = this->a;
+        DslashArg<Float, nDim, DDArg>::twist_b = this->b;
       }
     }
   };
@@ -58,7 +59,8 @@ namespace quda
 
 #pragma unroll
     for (int d = 0; d < Arg::nDim; d++) { // loop over dimension
-      {                              // Forward gather - compute fwd offset for vector fetch
+      // Forward gather - compute fwd offset for vector fetch
+      if (arg.dd_in.doHopping(coord, d, +1)) {
         const int fwd_idx = getNeighborIndexCB(coord, d, +1, arg.dc);
         constexpr int proj_dir = dagger ? +1 : -1;
         const bool ghost
@@ -95,7 +97,8 @@ namespace quda
         }
       }
 
-      { // Backward gather - compute back offset for spinor and gauge fetch
+      // Backward gather - compute back offset for spinor and gauge fetch
+      if (arg.dd_in.doHopping(coord, d, -1)) {
         const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc);
         const int gauge_idx = back_idx;
         constexpr int proj_dir = dagger ? -1 : +1;
@@ -161,13 +164,17 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
 
       Vector out;
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
 
       if (!dagger || Arg::asymmetric) // defined in dslash_wilson.cuh
         applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
       else // special dslash for symmetric dagger
         applyWilsonTM<nParity, dagger, 1, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
-      if (xpay && mykernel_type == INTERIOR_KERNEL) {
+      if (xpay && mykernel_type == INTERIOR_KERNEL && not arg.dd_x.isZero(coord)) {
         Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
         if (!dagger || Arg::asymmetric) {
           out += arg.a_inv * (x + arg.b_inv * x.igamma(4)); // apply inverse twist which is undone below
diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh
index 584999d52f..54a941cf5e 100644
--- a/include/kernels/dslash_wilson.cuh
+++ b/include/kernels/dslash_wilson.cuh
@@ -15,8 +15,8 @@ namespace quda
   /**
      @brief Parameter structure for driving the Wilson operator
    */
-  template <typename Float, int nColor_, int nDim, QudaReconstructType reconstruct_, bool distance_pc_ = false>
-  struct WilsonArg : DslashArg<Float, nDim> {
+  template <typename Float, int nColor_, int nDim, typename DDArg, QudaReconstructType reconstruct_, bool distance_pc_ = false>
+  struct WilsonArg : DslashArg<Float, nDim, DDArg> {
     static constexpr int nColor = nColor_;
     static constexpr int nSpin = 4;
     static constexpr bool spin_project = true;
@@ -44,22 +44,18 @@ namespace quda
     /** parameters for distance preconditioning */
     const real alpha0;
     const int t0;
-    const int comm_coord_dim_3;
-    const int comm_dim_dim_3;
 
     WilsonArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo,
               const GaugeField &U, double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
               const int *comm_override, double alpha0 = 0.0, int t0 = -1) :
-      DslashArg<Float, nDim>(out, in, halo, U, x, parity, dagger, a != 0.0 ? true : false, 1, spin_project,
-                             comm_override),
+      DslashArg<Float, nDim, DDArg>(out, in, halo, U, x, parity, dagger, a != 0.0 ? true : false, 1, spin_project,
+                                    comm_override),
       halo_pack(halo),
       halo(halo),
       U(U),
       a(a),
       alpha0(alpha0),
-      t0(t0),
-      comm_coord_dim_3(comm_coord(3) * this->dim[3]),
-      comm_dim_dim_3(comm_dim(3) * this->dim[3])
+      t0(t0)
     {
       for (auto i = 0u; i < out.size(); i++) {
         this->out[i] = out[i];
@@ -92,8 +88,8 @@ namespace quda
     // parity for gauge field - include residual parity from 5-d => 4-d checkerboarding
     const int gauge_parity = (Arg::nDim == 5 ? (coord.x_cb / arg.dc.volume_4d_cb + parity) % 2 : parity);
 
-    const int t = arg.comm_coord_dim_3 + coord[3];
-    const int nt = arg.comm_dim_dim_3;
+    const int t = coord.gx[3];
+    const int nt = arg.globalDim3;
     real fwd_coeff_3
       = Arg::distance_pc ? distanceWeight(arg, t + 1, nt) / distanceWeight(arg, t, nt) : static_cast<real>(1.0);
     real bwd_coeff_3
@@ -101,7 +97,8 @@ namespace quda
 
 #pragma unroll
     for (int d = 0; d < 4; d++) { // loop over dimension - 4 and not nDim since this is used for DWF as well
-      {                           // Forward gather - compute fwd offset for vector fetch
+      // Forward gather - compute fwd offset for vector fetch
+      if (arg.dd_in.doHopping(coord, d, +1)) {
         const real fwd_coeff = (d < 3) ? 1.0 : fwd_coeff_3;
         const int fwd_idx = getNeighborIndexCB(coord, d, +1, arg.dc);
         const int gauge_idx = (Arg::nDim == 5 ? coord.x_cb % arg.dc.volume_4d_cb : coord.x_cb);
@@ -129,7 +126,8 @@ namespace quda
         }
       }
 
-      { // Backward gather - compute back offset for spinor and gauge fetch
+      // Backward gather - compute back offset for spinor and gauge fetch
+      if (arg.dd_in.doHopping(coord, d, -1)) {
         const real bwd_coeff = (d < 3) ? 1.0 : bwd_coeff_3;
         const int back_idx = getNeighborIndexCB(coord, d, -1, arg.dc);
         const int gauge_idx = (Arg::nDim == 5 ? back_idx % arg.dc.volume_4d_cb : back_idx);
@@ -179,11 +177,18 @@ namespace quda
       auto coord = getCoords<QUDA_4D_PC, mykernel_type>(arg, idx, 0, parity, thread_dim);
 
       const int my_spinor_parity = nParity == 2 ? parity : 0;
+      int xs = coord.x_cb + coord.s * arg.dc.volume_4d_cb;
       Vector out;
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](xs, my_spinor_parity) = out;
+        return;
+      }
+
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
-      int xs = coord.x_cb + coord.s * arg.dc.volume_4d_cb;
-      if (xpay && mykernel_type == INTERIOR_KERNEL) {
+      if (xpay && mykernel_type == INTERIOR_KERNEL && arg.dd_x.isZero(coord)) {
+        out = arg.a * out;
+      } else if (xpay && mykernel_type == INTERIOR_KERNEL) {
         Vector x = arg.x[src_idx](xs, my_spinor_parity);
         out = x + arg.a * out;
       } else if (mykernel_type != INTERIOR_KERNEL && active) {
diff --git a/include/kernels/dslash_wilson_clover.cuh b/include/kernels/dslash_wilson_clover.cuh
index cb4c75a86b..5746c41014 100644
--- a/include/kernels/dslash_wilson_clover.cuh
+++ b/include/kernels/dslash_wilson_clover.cuh
@@ -7,9 +7,10 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_, bool twist_ = false, bool distance_pc_ = false>
-  struct WilsonCloverArg : WilsonArg<Float, nColor, nDim, reconstruct_, distance_pc_> {
-    using WilsonArg<Float, nColor, nDim, reconstruct_, distance_pc_>::nSpin;
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_, bool twist_ = false,
+            bool distance_pc_ = false>
+  struct WilsonCloverArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_, distance_pc_> {
+    using WilsonArg<Float, nColor, nDim, DDArg, reconstruct_, distance_pc_>::nSpin;
     static constexpr int length = (nSpin / (nSpin / 2)) * 2 * nColor * nColor * (nSpin / 2) * (nSpin / 2) / 2;
     static constexpr bool twist = twist_;
 
@@ -24,8 +25,8 @@ namespace quda
                     const ColorSpinorField &halo, const GaugeField &U, const CloverField &A, double a, double b,
                     cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
                     double alpha0 = 0.0, int t0 = -1) :
-      WilsonArg<Float, nColor, nDim, reconstruct_, distance_pc_>(out, in, halo, U, a, x, parity, dagger, comm_override,
-                                                                 alpha0, t0),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_, distance_pc_>(out, in, halo, U, a, x, parity, dagger,
+                                                                        comm_override, alpha0, t0),
       A(A, false),
       a(a),
       b(dagger ? -0.5 * b : 0.5 * b) // factor of 1/2 comes from clover normalization we need to correct for
@@ -62,10 +63,17 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
       Vector out;
 
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
+
       // defined in dslash_wilson.cuh
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
-      if (mykernel_type == INTERIOR_KERNEL) {
+      if (mykernel_type == INTERIOR_KERNEL and arg.dd_x.isZero(coord)) {
+        out = arg.a * out;
+      } else if (mykernel_type == INTERIOR_KERNEL) {
         Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
         x.toRel(); // switch to chiral basis
 
diff --git a/include/kernels/dslash_wilson_clover_hasenbusch_twist.cuh b/include/kernels/dslash_wilson_clover_hasenbusch_twist.cuh
index 994fd2caf1..d354ab07c4 100644
--- a/include/kernels/dslash_wilson_clover_hasenbusch_twist.cuh
+++ b/include/kernels/dslash_wilson_clover_hasenbusch_twist.cuh
@@ -7,9 +7,9 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_>
-  struct WilsonCloverHasenbuschTwistArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
-    using WilsonArg<Float, nColor, nDim, reconstruct_>::nSpin;
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct WilsonCloverHasenbuschTwistArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
+    using WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>::nSpin;
     static constexpr int length = (nSpin / (nSpin / 2)) * 2 * nColor * nColor * (nSpin / 2) * (nSpin / 2) / 2;
 
     typedef typename clover_mapper<Float, length>::type C;
@@ -23,7 +23,7 @@ namespace quda
                                    const ColorSpinorField &halo, const GaugeField &U, const CloverField &A, double a,
                                    double b, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                                    const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, a, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, a, x, parity, dagger, comm_override),
       A(A, false),
       a(a),
       b(dagger ? -0.5 * b : 0.5 * b) // factor of 1/2 comes from clover normalization we need to correct for
@@ -60,10 +60,17 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
       Vector out;
 
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
+
       // defined in dslash_wilson.cuh
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
-      if (mykernel_type == INTERIOR_KERNEL) {
+      if (mykernel_type == INTERIOR_KERNEL && arg.dd_x.isZero(coord)) {
+        out = arg.a * out;
+      } else if (mykernel_type == INTERIOR_KERNEL) {
         Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
         x.toRel(); // switch to chiral basis
 
diff --git a/include/kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh b/include/kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh
index e4bb7113f0..8b724962c5 100644
--- a/include/kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh
+++ b/include/kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh
@@ -7,9 +7,9 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_, bool clov_inv_>
-  struct WilsonCloverHasenbuschTwistPCArg : WilsonArg<Float, nColor, nDim, reconstruct_> {
-    using WilsonArg<Float, nColor, nDim, reconstruct_>::nSpin;
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_, bool clov_inv_>
+  struct WilsonCloverHasenbuschTwistPCArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_> {
+    using WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>::nSpin;
     static constexpr int length = (nSpin / (nSpin / 2)) * 2 * nColor * nColor * (nSpin / 2) * (nSpin / 2) / 2;
     static constexpr bool dynamic_clover = clover::dynamic_inverse();
     static constexpr bool clov_inv = clov_inv_;
@@ -25,7 +25,7 @@ namespace quda
                                      const ColorSpinorField &halo, const GaugeField &U, const CloverField &A_,
                                      double a_, double b_, cvector_ref<const ColorSpinorField> &x, int parity,
                                      bool dagger, const int *comm_override) :
-      WilsonArg<Float, nColor, nDim, reconstruct_>(out, in, halo, U, a_, x, parity, dagger, comm_override),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_>(out, in, halo, U, a_, x, parity, dagger, comm_override),
       A(A_, false),
       A_inv(A_, dynamic_clover ? false : true),
       b(dagger ? -0.5 * b_ : 0.5 * b_) // if dynamic clover we don't want the inverse field
@@ -64,6 +64,11 @@ namespace quda
 
       Vector out;
 
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
+
       // defined in dslash_wilson.cuh
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
 
@@ -75,7 +80,9 @@ namespace quda
 
       if (isComplete<mykernel_type>(arg, coord) && active) {
 
-        if (!Arg::clov_inv) {
+        if (!Arg::clov_inv and arg.dd_x.isZero(coord)) {
+          out = arg.a * out;
+        } else if (!Arg::clov_inv) {
           Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
           out = x + arg.a * out;
         } else {
@@ -100,14 +107,18 @@ namespace quda
           }
 
           tmp.toNonRel(); // switch back to non-chiral basis
-          Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
-          out = x + arg.a * tmp;
+          if (arg.dd_x.isZero(coord)) {
+            out = arg.a * tmp;
+          } else {
+            Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
+            out = x + arg.a * tmp;
+          }
         }
 
         // At this point: out = x + k A^{-1} D in or out = x + k D in
         //
         // now we must add on i g_5 b A x
-        {
+        if (not arg.dd_x.isZero(coord)) {
           Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
           x.toRel();
           Vector tmp;
diff --git a/include/kernels/dslash_wilson_clover_preconditioned.cuh b/include/kernels/dslash_wilson_clover_preconditioned.cuh
index 86d5c71534..0b2fb015e8 100644
--- a/include/kernels/dslash_wilson_clover_preconditioned.cuh
+++ b/include/kernels/dslash_wilson_clover_preconditioned.cuh
@@ -7,9 +7,9 @@
 namespace quda
 {
 
-  template <typename Float, int nColor, int nDim, QudaReconstructType reconstruct_, bool distance_pc_ = false>
-  struct WilsonCloverArg : WilsonArg<Float, nColor, nDim, reconstruct_, distance_pc_> {
-    using WilsonArg<Float, nColor, nDim, reconstruct_, distance_pc_>::nSpin;
+  template <typename Float, int nColor, int nDim, typename DDArg, QudaReconstructType reconstruct_, bool distance_pc_ = false>
+  struct WilsonCloverArg : WilsonArg<Float, nColor, nDim, DDArg, reconstruct_, distance_pc_> {
+    using WilsonArg<Float, nColor, nDim, DDArg, reconstruct_, distance_pc_>::nSpin;
     static constexpr int length = (nSpin / (nSpin / 2)) * 2 * nColor * nColor * (nSpin / 2) * (nSpin / 2) / 2;
     static constexpr bool dynamic_clover = clover::dynamic_inverse();
 
@@ -23,8 +23,8 @@ namespace quda
                     const ColorSpinorField &halo, const GaugeField &U, const CloverField &A, double a,
                     cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
                     double alpha0 = 0.0, int t0 = -1) :
-      WilsonArg<Float, nColor, nDim, reconstruct_, distance_pc_>(out, in, halo, U, a, x, parity, dagger, comm_override,
-                                                                 alpha0, t0),
+      WilsonArg<Float, nColor, nDim, DDArg, reconstruct_, distance_pc_>(out, in, halo, U, a, x, parity, dagger,
+                                                                        comm_override, alpha0, t0),
       A(A, dynamic_clover ? false : true), // if dynamic clover we don't want the inverse field
       a(a)
     {
@@ -61,6 +61,10 @@ namespace quda
       const int my_spinor_parity = nParity == 2 ? parity : 0;
 
       Vector out;
+      if (arg.dd_out.isZero(coord)) {
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
 
       // defined in dslash_wilson.cuh
       applyWilson<nParity, dagger, mykernel_type>(out, arg, coord, parity, idx, thread_dim, active, src_idx);
@@ -94,7 +98,9 @@ namespace quda
 
         tmp.toNonRel(); // switch back to non-chiral basis
 
-        if (xpay) {
+        if (xpay and arg.dd_x.isZero(coord)) {
+          out = arg.a * tmp;
+        } else if (xpay) {
           Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
           out = x + arg.a * tmp;
         } else {
diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh
index a3c2b1b377..70e758be76 100644
--- a/include/kernels/laplace.cuh
+++ b/include/kernels/laplace.cuh
@@ -14,8 +14,8 @@ namespace quda
   /**
      @brief Parameter structure for driving the covariatnt derivative operator
   */
-  template <typename Float, int nSpin_, int nColor_, int nDim, QudaReconstructType reconstruct_>
-  struct LaplaceArg : DslashArg<Float, nDim> {
+  template <typename Float, int nSpin_, int nColor_, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct LaplaceArg : DslashArg<Float, nDim, DDArg> {
     static constexpr int nColor = nColor_;
     static constexpr int nSpin = nSpin_;
     static constexpr bool spin_project = false;
@@ -45,7 +45,7 @@ namespace quda
     LaplaceArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                const ColorSpinorField &halo, const GaugeField &U, int dir, double a, double b,
                cvector_ref<const ColorSpinorField> &x, int parity, const int *comm_override) :
-      DslashArg<Float, nDim>(out, in, halo, U, x, parity, false, a != 0.0 ? true : false, 1, false, comm_override),
+      DslashArg<Float, nDim, DDArg>(out, in, halo, U, x, parity, false, a != 0.0 ? true : false, 1, false, comm_override),
       halo_pack(halo),
       halo(halo),
       U(U),
@@ -85,7 +85,7 @@ namespace quda
 #pragma unroll
     for (int d = 0; d < Arg::nDim; d++) { // loop over dimension
       if (d != dir) {
-        {
+        if (arg.dd_in.doHopping(coord, d, +1)) {
           // Forward gather - compute fwd offset for vector fetch
           const bool ghost = (coord[d] + 1 >= arg.dim[d]) && isActive<kernel_type>(active, thread_dim, d, coord, arg);
 	  
@@ -106,7 +106,7 @@ namespace quda
             out += U * in;
           }
         }
-        {
+        if (arg.dd_in.doHopping(coord, d, -1)) {
           // Backward gather - compute back offset for spinor and gauge fetch
 
           const int back_idx = linkIndexM1(coord, arg.dim, d);
@@ -159,6 +159,10 @@ namespace quda
 
       const int my_spinor_parity = nParity == 2 ? parity : 0;
       Vector out;
+      if (arg.dd_out.isZero(coord)) { // dd_out reports this site as zero: store default-constructed out, skip stencil
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
 
       // We instantiate two kernel types:
       // case 4 is an operator in all x,y,z,t dimensions
@@ -171,7 +175,9 @@ namespace quda
         break;
       }
 
-      if (xpay && mykernel_type == INTERIOR_KERNEL) {
+      if (xpay && mykernel_type == INTERIOR_KERNEL && arg.dd_x.isZero(coord)) {
+        out = arg.a * out;
+      } else if (xpay && mykernel_type == INTERIOR_KERNEL) {
         Vector x = arg.x[src_idx](coord.x_cb, my_spinor_parity);
         out = arg.a * out + arg.b * x;
       } else if (mykernel_type != INTERIOR_KERNEL) {
diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh
index 85e1790318..a6eff554e6 100644
--- a/include/kernels/staggered_quark_smearing.cuh
+++ b/include/kernels/staggered_quark_smearing.cuh
@@ -14,8 +14,8 @@ namespace quda
   /**
      @brief Parameter structure for driving the covariant derivative operator
   */
-  template <typename Float, int nSpin_, int nColor_, int nDim, QudaReconstructType reconstruct_>
-  struct StaggeredQSmearArg : DslashArg<Float, nDim> {
+  template <typename Float, int nSpin_, int nColor_, int nDim, typename DDArg, QudaReconstructType reconstruct_>
+  struct StaggeredQSmearArg : DslashArg<Float, nDim, DDArg> {
     static constexpr int nColor = 3;
     static constexpr int nSpin = 1;
     static constexpr bool spin_project = false;
@@ -50,7 +50,7 @@ namespace quda
     StaggeredQSmearArg(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                        const ColorSpinorField &halo, const GaugeField &U, int t0, bool is_t0_kernel, int parity,
                        int dir, bool dagger, const int *comm_override) :
-      DslashArg<Float, nDim>(out, in, halo, U, in, parity, dagger, false, 3, false, comm_override),
+      DslashArg<Float, nDim, DDArg>(out, in, halo, U, in, parity, dagger, false, 3, false, comm_override),
       halo_pack(halo, 3),
       halo(halo, 3),
       U(U),
@@ -106,7 +106,7 @@ namespace quda
 #pragma unroll
     for (int d = 0; d < Arg::nDim; d++) { // loop over dimension
       if (d != dir) {
-        {
+        if (arg.dd_in.doHopping(coord, d, +2)) {
           // Forward gather - compute fwd offset for vector fetch
           const bool ghost
             = (coord[d] + 2 >= arg.dim[d]) && isActive<kernel_type>(active, thread_dim, d, coord, arg); // 1=>2
@@ -128,7 +128,7 @@ namespace quda
             out = mv_add(U_2link, in_2hop, out);
           }
         }
-        {
+        if (arg.dd_in.doHopping(coord, d, -2)) {
           // Backward gather - compute back offset for spinor and gauge fetch
           const bool ghost = (coord[d] - 2 < 0) && isActive<kernel_type>(active, thread_dim, d, coord, arg); // 1=>2
 
@@ -203,6 +203,11 @@ namespace quda
 
       const int my_spinor_parity = nParity == 2 ? parity : 0;
       Vector out;
+      if (arg.dd_out.isZero(coord)) { // dd_out reports this site as zero: store default-constructed out, skip stencil
+        if (mykernel_type != EXTERIOR_KERNEL_ALL || active) arg.out[src_idx](coord.x_cb, my_spinor_parity) = out;
+        return;
+      }
+
       // We instantiate two kernel types:
       // case 4 is an operator in all x,y,z,t dimensions
       // case 3 is a spatial operator only, the t dimension is omitted.
diff --git a/include/lattice_field.h b/include/lattice_field.h
index 425432d6a6..e86f69f1c3 100644
--- a/include/lattice_field.h
+++ b/include/lattice_field.h
@@ -7,6 +7,7 @@
 #include <object.h>
 #include <quda_api.h>
 #include <reference_wrapper_helper.h>
+#include <domain_decomposition.h>
 
 /**
  * @file lattice_field.h
@@ -962,6 +963,39 @@ namespace quda {
 
 #define checkNative(...) Native_(__func__, __FILE__, __LINE__, __VA_ARGS__)
 
+  /**
+     @brief Check that two fields each pass their own DD check and that their domain decompositions match.
+     @param[in] a_ First input field; func, file and line give the call site quoted in the error message
+     @param[in] b_ Second input field
+     @return true; every failing check is reported through errorQuda rather than by returning false
+   */
+  template <typename T1, typename T2>
+  inline bool DD_(const char *func, const char *file, int line, const T1 &a_, const T2 &b_)
+  {
+    const unwrap_t<T1> &a(a_);
+    const unwrap_t<T2> &b(b_);
+    if (!a.DD().check(a, true)) errorQuda("DD checks not passed (%s:%d in %s())", file, line, func);
+    if (!b.DD().check(b, true)) errorQuda("DD checks not passed (%s:%d in %s())", file, line, func);
+    if (!a.DD().match(b.DD(), true)) errorQuda("DD not match (%s:%d in %s())", file, line, func);
+    return true;
+  }
+
+  /**
+     @brief Variadic overload: applies the two-field DD_ check to the given fields pairwise.
+     @param[in] a Input field
+     @param[in] b Input field
+     @param[in] args List of additional fields to check domain decomposition on
+     @return true if every pairwise check returned true
+   */
+  template <typename T1, typename T2, typename... Args>
+  inline bool DD_(const char *func, const char *file, int line, const T1 &a, const T2 &b, const Args &...args)
+  {
+    // recursion: (a, b), then a against args..., then b against args... - together this covers all pairs
+    return (DD_(func, file, line, a, b) && DD_(func, file, line, a, args...) && DD_(func, file, line, b, args...));
+  }
+
+#define checkDD(...) DD_(__func__, __FILE__, __LINE__, __VA_ARGS__)
+
   /**
      @brief Return whether data is reordered on the CPU or GPU.  This can set
      at QUDA initialization using the environment variable
diff --git a/include/quda_define.h.in b/include/quda_define.h.in
index 90dc4b0b81..dfab066518 100644
--- a/include/quda_define.h.in
+++ b/include/quda_define.h.in
@@ -126,6 +126,12 @@
 #define GPU_LAPLACE
 #endif
 
+/**
+ * @def   QUDA_DOMAIN_DECOMPOSITION
+ * @brief This macro sets the type of Domain Decomposition (DD)-aware Dirac operator enabled
+ */
+#define QUDA_DOMAIN_DECOMPOSITION @QUDA_DOMAIN_DECOMPOSITION@
+
 #cmakedefine QUDA_DIRAC_DISTANCE_PRECONDITIONING
 #ifdef QUDA_DIRAC_DISTANCE_PRECONDITIONING
 /**
diff --git a/include/reference_wrapper_helper.h b/include/reference_wrapper_helper.h
index ed27d17106..8a1c8e48e0 100644
--- a/include/reference_wrapper_helper.h
+++ b/include/reference_wrapper_helper.h
@@ -7,6 +7,7 @@
 #include <enum_quda.h>
 #include <util_quda.h>
 #include <quda_internal.h>
+#include <domain_decomposition.h>
 
 namespace quda
 {
@@ -375,6 +376,15 @@ namespace quda
       return operator[](0).X(d);
     }
 
+    template <class U = T>
+    std::enable_if_t<std::is_same_v<std::remove_const_t<U>, ColorSpinorField>, int> full_dim(int d) const
+    {
+      for (auto i = 1u; i < vector::size(); i++) // compare each element with its predecessor
+        if (operator[](i - 1).full_dim(d) != operator[](i).full_dim(d))
+          errorQuda("Dimension %d does not match %d != %d", d, operator[](i - 1).full_dim(d), operator[](i).full_dim(d));
+      return operator[](0).full_dim(d); // after the consistency check, report the first element's value
+    }
+
     template <class U = T>
     std::enable_if_t<std::is_same_v<std::remove_const_t<U>, ColorSpinorField>, size_t> Length() const
     {
@@ -459,6 +469,25 @@ namespace quda
     {
       return operator[](0).AuxString();
     }
+
+    template <class U = T>
+    std::enable_if_t<std::is_same_v<std::remove_const_t<U>, ColorSpinorField>, const DDParam> DD() const
+    {
+      for (auto i = 1u; i < vector::size(); i++) // compare each element with its predecessor; i is unsigned, hence %u
+        if (operator[](i - 1).DD() != operator[](i).DD()) errorQuda("DD do not match %u != %u", i - 1, i);
+      return operator[](0).DD(); // after the consistency check, report the first element's DD
+    }
+
+    template <class U = T, typename... Args>
+    std::enable_if_t<std::is_same_v<U, ColorSpinorField>, void> DD(const quda::DD &flag, const Args &...args)
+    { // enabled only when the element type is exactly (non-const) ColorSpinorField
+      for (auto i = 0u; i < vector::size(); i++) operator[](i).DD(flag, args...); // forward to every element
+    }
+
+    template <class U = T> std::enable_if_t<std::is_same_v<U, ColorSpinorField>, void> projectDD()
+    { // enabled only when the element type is exactly (non-const) ColorSpinorField
+      for (auto i = 0u; i < vector::size(); i++) operator[](i).projectDD(); // forward to every element
+    }
   };
 
   template <class T> using cvector_ref = const vector_ref<T>;
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index d88ba6a239..0ca7e5f7bd 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -34,7 +34,7 @@ set (QUDA_OBJS
   inv_cg3_quda.cpp inv_ca_gcr.cpp inv_ca_cg.cpp
   inv_gcr_quda.cpp inv_mr_quda.cpp inv_sd_quda.cpp
   inv_pcg_quda.cpp inv_mre.cpp interface_quda.cpp util_quda.cpp
-  color_spinor_field.cpp color_spinor_util.cu
+  color_spinor_field.cpp color_spinor_util.cu color_spinor_project_dd.cu
   field_cache.cpp
   gauge_covdev.cpp dirac.cpp
   clover_field.cpp lattice_field.cpp gauge_field.cpp
@@ -189,6 +189,127 @@ configure_file(dslash_coarse_mma.in.hpp dslash_coarse_mma.hpp @ONLY)
 configure_file(block_transpose.in.cu block_transpose.cu @ONLY)
 configure_file(multigrid.in.hpp multigrid.hpp @ONLY)
 
+# check_bit(<number> <bit>): sets BIT_CHECK in the caller's scope to <number> & (1 << <bit>)
+function(check_bit number bit)
+  # The result is 0 when the bit is clear and the non-zero mask value (not 1) when it is set
+  math(EXPR result "${number} & (1 << ${bit})")
+  set(BIT_CHECK "${result}" PARENT_SCOPE)
+endfunction()
+
+set(QUDA_DSLASH_FILENAME_LIST # base names; the loop below configures <name>.in.cu for every entry
+  dslash_wilson
+  dslash_wilson_clover
+  dslash_wilson_clover_preconditioned
+  dslash_wilson_clover_hasenbusch_twist
+  dslash_wilson_clover_hasenbusch_twist_preconditioned_NoClovInv
+  dslash_wilson_clover_hasenbusch_twist_preconditioned_ClovInv
+  dslash_twisted_mass
+  dslash_twisted_mass_preconditioned
+  dslash_twisted_clover
+  dslash_twisted_clover_preconditioned
+  dslash_ndeg_twisted_mass
+  dslash_ndeg_twisted_mass_preconditioned
+  dslash_ndeg_twisted_clover
+  dslash_ndeg_twisted_clover_preconditioned
+  dslash_staggered
+  dslash_improved_staggered
+  dslash_domain_wall_4d
+  dslash_domain_wall_5d
+  laplace)
+
+set(QUDA_DSLASH_PREC_LIST "") # filled from the QUDA_PRECISION bits below
+set(QUDA_DSLASH_NCOLOR_LIST "3")
+set(QUDA_DSLASH_DDARG_LIST "DDNo") # DDNo is always present; DDRedBlack is appended below on request
+set(QUDA_DSLASH_RECONI_LIST "") # filled from the QUDA_RECONSTRUCT bits below
+set(QUDA_DSLASH_DISTANCE_LIST "false")
+set(QUDA_DSLASH_DWTYPE_LIST "") # filled only when QUDA_DIRAC_DOMAIN_WALL is set
+
+if(QUDA_DIRAC_DISTANCE_PRECONDITIONING) # additionally generate the distance "true" variants
+  list(APPEND QUDA_DSLASH_DISTANCE_LIST "true")
+endif()
+
+if(QUDA_DIRAC_DOMAIN_WALL) # without it the list stays empty and no fused-m5 files are generated below
+  list(APPEND QUDA_DSLASH_DWTYPE_LIST DSLASH5_MOBIUS_PRE
+    DSLASH5_MOBIUS
+    M5_INV_MOBIUS
+    M5_INV_MOBIUS_M5_PRE
+    M5_PRE_MOBIUS_M5_INV
+    M5_INV_MOBIUS_M5_INV_DAG
+    DSLASH5_MOBIUS_PRE_M5_MOB)
+endif()
+
+check_bit(${QUDA_PRECISION} 0) # QUDA_PRECISION bits 0..3 enable QUARTER, HALF, SINGLE, DOUBLE in that order
+if(BIT_CHECK)
+  list(APPEND QUDA_DSLASH_PREC_LIST "QUARTER")
+endif()
+
+check_bit(${QUDA_PRECISION} 1)
+if(BIT_CHECK)
+  list(APPEND QUDA_DSLASH_PREC_LIST "HALF")
+endif()
+
+check_bit(${QUDA_PRECISION} 2)
+if(BIT_CHECK)
+  list(APPEND QUDA_DSLASH_PREC_LIST "SINGLE")
+endif()
+
+check_bit(${QUDA_PRECISION} 3)
+if(BIT_CHECK)
+  list(APPEND QUDA_DSLASH_PREC_LIST "DOUBLE")
+endif()
+
+check_bit(${QUDA_RECONSTRUCT} 0) # QUDA_RECONSTRUCT bits 0, 1, 2 enable reconI values 2, 1, 0 (reversed order)
+if(BIT_CHECK)
+  list(APPEND QUDA_DSLASH_RECONI_LIST "2")
+endif()
+
+check_bit(${QUDA_RECONSTRUCT} 1)
+if(BIT_CHECK)
+  list(APPEND QUDA_DSLASH_RECONI_LIST "1")
+endif()
+
+check_bit(${QUDA_RECONSTRUCT} 2)
+if(BIT_CHECK)
+  list(APPEND QUDA_DSLASH_RECONI_LIST "0")
+endif()
+
+check_bit(${QUDA_DOMAIN_DECOMPOSITION} 0) # bit 0 adds DDRedBlack next to the default DDNo
+if(BIT_CHECK)
+  list(APPEND QUDA_DSLASH_DDARG_LIST "DDRedBlack")
+endif()
+
+message(STATUS "QUDA_DSLASH_FILENAME_LIST=${QUDA_DSLASH_FILENAME_LIST}") # print every list used by the loops below
+message(STATUS "QUDA_DSLASH_PREC_LIST=${QUDA_DSLASH_PREC_LIST}")
+message(STATUS "QUDA_DSLASH_NCOLOR_LIST=${QUDA_DSLASH_NCOLOR_LIST}")
+message(STATUS "QUDA_DSLASH_DDARG_LIST=${QUDA_DSLASH_DDARG_LIST}")
+message(STATUS "QUDA_DSLASH_RECONI_LIST=${QUDA_DSLASH_RECONI_LIST}")
+message(STATUS "QUDA_DSLASH_DISTANCE_LIST=${QUDA_DSLASH_DISTANCE_LIST}")
+message(STATUS "QUDA_DSLASH_DWTYPE_LIST=${QUDA_DSLASH_DWTYPE_LIST}")
+
+foreach(QUDA_DSLASH_PREC IN LISTS QUDA_DSLASH_PREC_LIST) # one generated .cu per combination of all lists
+  string(TOLOWER "${QUDA_DSLASH_PREC}" QUDA_DSLASH_PREC_LOWER)
+  foreach(QUDA_DSLASH_NCOLOR IN LISTS QUDA_DSLASH_NCOLOR_LIST)
+    foreach(QUDA_DSLASH_DDARG IN LISTS QUDA_DSLASH_DDARG_LIST)
+      foreach(QUDA_DSLASH_RECONI IN LISTS QUDA_DSLASH_RECONI_LIST)
+        foreach(QUDA_DSLASH_DISTANCE IN LISTS QUDA_DSLASH_DISTANCE_LIST)
+          set(DSLASH_SPECS "${QUDA_DSLASH_PREC_LOWER}_nc_${QUDA_DSLASH_NCOLOR}_${QUDA_DSLASH_DDARG}_recon${QUDA_DSLASH_RECONI}_dist-${QUDA_DSLASH_DISTANCE}")
+          foreach(QUDA_DSLASH_FILENAME IN LISTS QUDA_DSLASH_FILENAME_LIST) # plain dslash templates
+            set(TMP_FILENAME "${QUDA_DSLASH_FILENAME}_${DSLASH_SPECS}.cu")
+            configure_file(${QUDA_DSLASH_FILENAME}.in.cu ${TMP_FILENAME} @ONLY)
+            list(PREPEND QUDA_CU_OBJS ${TMP_FILENAME})
+          endforeach()
+          foreach(QUDA_DSLASH_DWTYPE IN LISTS QUDA_DSLASH_DWTYPE_LIST) # fused-m5 template, once per Dslash5Type
+            string(TOLOWER "${QUDA_DSLASH_DWTYPE}" QUDA_DSLASH_DWTYPE_LOWER)
+            set(TMP_FILENAME "dslash_domain_wall_4d_${QUDA_DSLASH_DWTYPE_LOWER}_${DSLASH_SPECS}.cu")
+            configure_file(dslash_domain_wall_4d_fused_m5.in.cu ${TMP_FILENAME} @ONLY)
+            list(PREPEND QUDA_CU_OBJS ${TMP_FILENAME})
+          endforeach()
+        endforeach()
+      endforeach()
+    endforeach()
+  endforeach()
+endforeach()
+
 if(QUDA_MULTIGRID)
   string(REPLACE "," ";" QUDA_MULTIGRID_NVEC_LIST_SEMICOLON "${QUDA_MULTIGRID_NVEC_LIST}")
   string(REPLACE "," ";" QUDA_MULTIGRID_MRHS_LIST_SEMICOLON "${QUDA_MULTIGRID_MRHS_LIST}")
@@ -502,6 +623,7 @@ endif()
 # set which precisions to enable
 target_compile_definitions(quda PUBLIC QUDA_PRECISION=${QUDA_PRECISION})
 target_compile_definitions(quda PUBLIC QUDA_RECONSTRUCT=${QUDA_RECONSTRUCT})
+target_compile_definitions(quda PUBLIC QUDA_DSLASH5TYPE=${QUDA_DSLASH5TYPE})
 
 if(QUDA_FAST_COMPILE_REDUCE)
   target_compile_definitions(quda PRIVATE QUDA_FAST_COMPILE_REDUCE)
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp
index ae24f19417..8f0239bb37 100644
--- a/lib/color_spinor_field.cpp
+++ b/lib/color_spinor_field.cpp
@@ -99,6 +99,7 @@ namespace quda
     nVec = param.nVec;
     nVec_actual = param.nVec_actual;
     twistFlavor = param.twistFlavor;
+    dd = param.dd;
 
     if (param.pc_type != QUDA_5D_PC && param.pc_type != QUDA_4D_PC) errorQuda("Unexpected pc_type %d", param.pc_type);
     pc_type = param.pc_type;
@@ -540,6 +541,7 @@ namespace quda
     param.pc_type = pc_type;
     param.suggested_parity = suggested_parity;
     param.create = QUDA_NULL_FIELD_CREATE;
+    param.dd = dd;
   }
 
   void ColorSpinorField::exchange(void **ghost, void **sendbuf, int nFace) const
@@ -1557,6 +1559,8 @@ namespace quda
     genericPrintVector(*this, parity, x_cb, rank);
   }
 
+  void ColorSpinorField::projectDD() { genericProjectDD(*this); } // acts on this field itself via genericProjectDD
+
   int ColorSpinorField::Compare(const ColorSpinorField &a, const ColorSpinorField &b, const int tol)
   {
     if (checkLocation(a, b) == QUDA_CUDA_FIELD_LOCATION) errorQuda("device field not implemented");
diff --git a/lib/color_spinor_project_dd.cu b/lib/color_spinor_project_dd.cu
new file mode 100644
index 0000000000..b364e877b1
--- /dev/null
+++ b/lib/color_spinor_project_dd.cu
@@ -0,0 +1,141 @@
+#include <tuple>
+#include <memory>
+#include <color_spinor_field.h>
+#include <color_spinor_field_order.h>
+#include <index_helper.cuh>
+#include <blas_quda.h>
+#include <instantiate.h>
+#include <domain_decomposition_helper.cuh>
+#include <tunable_nd.h>
+#include <kernels/color_spinor_project_dd.cuh>
+
+namespace quda
+{
+
+  template <typename Float, typename DDArg, int nSpin, int nColor, typename Order>
+  class ProjectDD : public TunableKernel2D // tunable wrapper that launches the ProjectDD_ kernel on `out`
+  {
+    using Arg = ProjectDDArg<Float, DDArg, nSpin, nColor, Order>;
+    ColorSpinorField &out; // the field handed to the kernel argument
+
+    bool advanceSharedBytes(TuneParam &) const { return false; } // Don't tune shared mem
+    unsigned int minThreads() const { return out.VolumeCB(); }
+
+  public:
+    ProjectDD(ColorSpinorField &out) : TunableKernel2D(out, out.SiteSubset()), out(out)
+    {
+      strcat(aux, out.AuxString().c_str());
+      switch (out.DD().type) { // append the DD type to aux; presumably keeps tuning entries per DD type apart
+      case QUDA_DD_NO: strcat(aux, ",DDNo"); break;
+      case QUDA_DD_RED_BLACK: strcat(aux, ",DDRedBlack"); break;
+      default: errorQuda("DD type %d not implemented", out.DD().type);
+      }
+
+      apply(device::get_default_stream()); // constructing the object immediately launches on the default stream
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      constexpr bool enable_host = true; // NOTE(review): presumably lets the launch also serve host fields - confirm
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      launch<ProjectDD_, enable_host>(tp, stream, Arg(out));
+    }
+
+    long long bytes() const { return out.Bytes(); }
+  };
+
+  template <typename P, typename DDArg> void projectDD(P &p, DDArg &dd, const ColorSpinorField &meta)
+  {
+    // Host loop over all sites: every site for which dd.isZero() holds has all spin/color components set to 0
+    Coord<4> site;
+    int dims[4] = {meta.full_dim(0), meta.full_dim(1), meta.full_dim(2), meta.full_dim(3)};
+    int origin[4] = {comm_coord(0) * dims[0], comm_coord(1) * dims[1], comm_coord(2) * dims[2], comm_coord(3) * dims[3]};
+
+    for (int parity = 0; parity < p.Nparity(); parity++) {
+      for (int x_cb = 0; x_cb < p.VolumeCB(); x_cb++) {
+        getCoords(site, x_cb, dims, parity);
+        for (int mu = 0; mu < site.size(); mu++) site.gx[mu] = origin[mu] + site.x[mu]; // shift by comm_coord * dim
+        if (!dd.isZero(site)) continue;                                                // this site is left untouched
+
+        for (int s = 0; s < p.Nspin(); s++)
+          for (int c = 0; c < p.Ncolor(); c++) p(parity, x_cb, s, c) = 0;
+      }
+    }
+  }
+
+  template <typename Float, typename DDArg, int nSpin, int nColor, typename Order>
+  void genericProjectDD(ColorSpinorField &a)
+  {
+    /* Reference CPU implementation (currently disabled)
+    if (a.Location() == QUDA_CPU_FIELD_LOCATION and a.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
+      FieldOrderCB<Float, nSpin, nColor, 1, QUDA_SPACE_SPIN_COLOR_FIELD_ORDER> A(a);
+      DDArg dd(a);
+      return projectDD(A, dd, a);
+    } */
+
+    ProjectDD<Float, DDArg, nSpin, nColor, Order> A(a); // the ProjectDD constructor itself performs the launch
+  }
+
+  /** Pick the accessor type (Order) that matches the field's order and forward to the launcher overload */
+  template <typename Float, typename DDArg, int nSpin, int nColor> void genericProjectDD(ColorSpinorField &a)
+  {
+    if (a.isNative()) {
+      using Order = typename colorspinor_mapper<Float, nSpin, nColor>::type;
+      genericProjectDD<Float, DDArg, nSpin, nColor, Order>(a);
+    } else if (a.FieldOrder() == QUDA_SPACE_SPIN_COLOR_FIELD_ORDER) {
+      using Order = SpaceSpinorColorOrder<Float, nSpin, nColor>;
+      genericProjectDD<Float, DDArg, nSpin, nColor, Order>(a);
+    } else if (a.FieldOrder() == QUDA_SPACE_COLOR_SPIN_FIELD_ORDER) {
+      using Order = SpaceColorSpinorOrder<Float, nSpin, nColor>;
+      genericProjectDD<Float, DDArg, nSpin, nColor, Order>(a);
+    } else if (a.FieldOrder() == QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER) {
+      using Order = PaddedSpaceSpinorColorOrder<Float, nSpin, nColor>;
+      if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) // instantiated only when the TIFR interface is enabled
+        genericProjectDD<Float, DDArg, nSpin, nColor, Order>(a);
+      else
+        errorQuda("TIFR interface has not been built");
+    } else if (a.FieldOrder() == QUDA_QDPJIT_FIELD_ORDER) {
+      using Order = QDPJITDiracOrder<Float, nSpin, nColor>;
+      if constexpr (is_enabled<QUDA_QDPJIT_GAUGE_ORDER>()) // instantiated only when the QDPJIT interface is enabled
+        genericProjectDD<Float, DDArg, nSpin, nColor, Order>(a);
+      else
+        errorQuda("QDPJIT interface has not been built");
+    } else {
+      errorQuda("Order %d not defined (Ns=%d, Nc=%d, precision = %d)", a.FieldOrder(), nSpin, nColor, a.Precision());
+    }
+  }
+
+  template <typename Float, typename DDArg> void genericProjectDD(ColorSpinorField &a)
+  {
+    if (a.Nspin() == 1) { // dispatch on spin with nColor fixed to 3; a compiled-out spin is an error, not a no-op
+      if constexpr (is_enabled_spin(1)) genericProjectDD<Float, DDArg, 1, 3>(a);
+      else errorQuda("Nspin %d not enabled in this build", a.Nspin());
+    } else if (a.Nspin() == 2) {
+      if constexpr (is_enabled_spin(2)) genericProjectDD<Float, DDArg, 2, 3>(a);
+      else errorQuda("Nspin %d not enabled in this build", a.Nspin());
+    } else if (a.Nspin() == 4) {
+      if constexpr (is_enabled_spin(4)) genericProjectDD<Float, DDArg, 4, 3>(a);
+      else errorQuda("Nspin %d not enabled in this build", a.Nspin());
+    } else {
+      errorQuda("Nspin %d not implemented", a.Nspin());
+    }
+  }
+
+  template <typename Float> void genericProjectDD(ColorSpinorField &a)
+  {
+    switch (a.DD().type) { // map the field's runtime DD type onto the matching DDArg template argument
+    case QUDA_DD_NO: genericProjectDD<Float, DDNo>(a); break;
+    case QUDA_DD_RED_BLACK: genericProjectDD<Float, DDRedBlack>(a); break;
+    default: errorQuda("DD type %d not implemented", a.DD().type);
+    }
+  }
+
+  void genericProjectDD(ColorSpinorField &a)
+  {
+    switch (a.Precision()) { // entry point: only double and single precision are dispatched, others are an error
+    case QUDA_DOUBLE_PRECISION: genericProjectDD<double>(a); break;
+    case QUDA_SINGLE_PRECISION: genericProjectDD<float>(a); break;
+    default: errorQuda("Precision %d not implemented", a.Precision());
+    }
+  }
+} // namespace quda
diff --git a/lib/color_spinor_util.in.cu b/lib/color_spinor_util.in.cu
index 3529d64e5a..0d1369e40b 100644
--- a/lib/color_spinor_util.in.cu
+++ b/lib/color_spinor_util.in.cu
@@ -5,6 +5,7 @@
 #include <index_helper.cuh>
 #include <blas_quda.h>
 #include <instantiate.h>
+#include <domain_decomposition_helper.cuh>
 #include <int_list.hpp>
 
 namespace quda {
diff --git a/lib/covariant_derivative.cu b/lib/covariant_derivative.cu
index 920eba2db1..a6b937dedc 100644
--- a/lib/covariant_derivative.cu
+++ b/lib/covariant_derivative.cu
@@ -87,8 +87,8 @@ namespace quda
     long long bytes() const override
     {
       int gauge_bytes = arg.reconstruct * in.Precision();
-      int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() +
-        (isFixed<typename Arg::Float>::value ? sizeof(float) : 0);
+      int spinor_bytes
+        = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed<typename Arg::Float>::value ? sizeof(float) : 0);
       int ghost_bytes = gauge_bytes + 3 * spinor_bytes; // 3 since we have to load the partial
       int dim = arg.mu % 4;
       long long bytes_ = 0;
@@ -132,7 +132,7 @@ namespace quda
     }
   };
 
-  template <typename Float, int nColor, QudaReconstructType recon> struct CovDevApply {
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct CovDevApply {
 
     CovDevApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                 cvector_ref<const ColorSpinorField> &, const GaugeField &U, int mu, int parity, bool dagger,
@@ -142,11 +142,11 @@ namespace quda
       constexpr int nDim = 4;
       auto halo = ColorSpinorField::create_comms_batch(in, 1, false);
       if (in.Nspin() == 4) {
-        CovDevArg<Float, 4, nColor, recon, nDim> arg(out, in, halo, U, mu, parity, dagger, comm_override);
+        CovDevArg<Float, 4, nColor, DDArg, recon, nDim> arg(out, in, halo, U, mu, parity, dagger, comm_override);
         CovDev<decltype(arg)> covDev(arg, out, in, halo);
         dslash::DslashPolicyTune<decltype(covDev)> policy(covDev, in, halo, profile);
       } else if (in.Nspin() == 1) {
-        CovDevArg<Float, 1, nColor, recon, nDim> arg(out, in, halo, U, mu, parity, dagger, comm_override);
+        CovDevArg<Float, 1, nColor, DDArg, recon, nDim> arg(out, in, halo, U, mu, parity, dagger, comm_override);
         CovDev<decltype(arg)> covDev(arg, out, in, halo);
         dslash::DslashPolicyTune<decltype(covDev)> policy(covDev, in, halo, profile);
       } else {
diff --git a/lib/dslash_domain_wall_4d.cu b/lib/dslash_domain_wall_4d.cu
index 9ba7d1d689..4089091df3 100644
--- a/lib/dslash_domain_wall_4d.cu
+++ b/lib/dslash_domain_wall_4d.cu
@@ -1,11 +1,6 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_domain_wall_4d.cuh>
-
+#define SIGNATURE_ONLY
+#include <dslash_domain_wall_4d.hpp>
+#undef SIGNATURE_ONLY
 /**
    This is the gauged domain-wall 4-d preconditioned operator.
 
@@ -16,42 +11,6 @@
 namespace quda
 {
 
-  template <typename Arg> class DomainWall4D : public Dslash<domainWall4D, Arg>
-  {
-    using Dslash = Dslash<domainWall4D, Arg>;
-    using Dslash::arg;
-    using Dslash::in;
-
-  public:
-    DomainWall4D(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                 const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      Dslash::template instantiate<packShmem>(tp, stream);
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct DomainWall4DApply {
-
-    DomainWall4DApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double m_5,
-                      const Complex *b_5, const Complex *c_5, int parity, bool dagger, const int *comm_override,
-                      TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      DomainWall4DArg<Float, nColor, nDim, recon> arg(out, in, halo, U, a, m_5, b_5, c_5, a != 0.0, x, parity, dagger,
-                                                      comm_override);
-      DomainWall4D<decltype(arg)> dwf(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(dwf)> policy(dwf, in, halo, profile);
-    }
-  };
 
   // Apply the 4-d preconditioned domain-wall Dslash operator
   // out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
@@ -61,7 +20,8 @@ namespace quda
                          TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>() || is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
-      instantiate<DomainWall4DApply>(out, in, x, U, a, m_5, b_5, c_5, parity, dagger, comm_override, profile);
+      auto dummy = DistanceType<false>();
+      instantiate<DomainWall4DApply>(out, in, x, U, a, m_5, b_5, c_5, parity, dagger, comm_override, dummy, profile);
     } else {
       errorQuda("Domain-wall dslash has not been built");
     }
diff --git a/lib/dslash_domain_wall_4d.hpp b/lib/dslash_domain_wall_4d.hpp
new file mode 100644
index 0000000000..a2aef1bd39
--- /dev/null
+++ b/lib/dslash_domain_wall_4d.hpp
@@ -0,0 +1,63 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_domain_wall_4d.cuh>
+
+/**
+   This is the gauged domain-wall 4-d preconditioned operator.
+
+   Note, for now, this just applies a batched 4-d dslash across the fifth
+   dimension.
+*/
+
+namespace quda
+{
+
+  template <typename Arg> class DomainWall4D : public Dslash<domainWall4D, Arg> // launcher for the domainWall4D kernel
+  {
+    using Dslash = Dslash<domainWall4D, Arg>;
+    using Dslash::arg;
+    using Dslash::in;
+
+  public:
+    DomainWall4D(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                 const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo) // all state lives in the Dslash base
+    {
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); // look up the launch parameters first
+      Dslash::setParam(tp);
+      Dslash::template instantiate<packShmem>(tp, stream);
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType { // empty tag type: carries distance_pc as a compile-time flag
+  }; // NOTE(review): dslash_domain_wall_4d_fused_m5.hpp defines the same struct and this header has no include guard
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct DomainWall4DApply {
+    // The unnamed DistanceType<distance_pc> tag only selects the constructor instantiation; the body never reads it
+    template <bool distance_pc>
+    DomainWall4DApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double m_5,
+                      const Complex *b_5, const Complex *c_5, int parity, bool dagger, const int *comm_override,
+                      DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+      ; // declaration only; the definition is expected from an explicit instantiation in another translation unit
+#else
+    {
+      constexpr int nDim = 4;
+      using Arg = DomainWall4DArg<Float, nColor, nDim, DDArg, recon>;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      Arg arg(out, in, halo, U, a, m_5, b_5, c_5, a != 0.0, x, parity, dagger, comm_override);
+      DomainWall4D<Arg> dwf(arg, out, in, halo);
+      dslash::DslashPolicyTune<DomainWall4D<Arg>> policy(dwf, in, halo, profile);
+    }
+#endif
+  };
+} // namespace quda
diff --git a/lib/dslash_domain_wall_4d.in.cu b/lib/dslash_domain_wall_4d.in.cu
new file mode 100644
index 0000000000..f127797e4e
--- /dev/null
+++ b/lib/dslash_domain_wall_4d.in.cu
@@ -0,0 +1,22 @@
+#include <dslash_domain_wall_4d.hpp>
+
+
+namespace quda
+{
+  // The @...@ placeholders are filled in by CMake's configure_file, once per generated translation unit
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@; // index into ReconstructWilson::recon
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+
+  using DDArg = @QUDA_DSLASH_DDARG@;
+  using Float = precision_type_mapper<precision>::type;
+
+  template struct DomainWall4DApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+
+  template DomainWall4DApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::DomainWall4DApply(
+    cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+    cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double m_5, const Complex *b_5,
+    const Complex *c_5, int parity, bool dagger, const int *comm_override, DistanceType<distance_pc>,
+    TimeProfile &profile);
+} // namespace quda
diff --git a/lib/dslash_domain_wall_4d_fused_m5.hpp b/lib/dslash_domain_wall_4d_fused_m5.hpp
index 37b0ca69da..1ec465e318 100644
--- a/lib/dslash_domain_wall_4d_fused_m5.hpp
+++ b/lib/dslash_domain_wall_4d_fused_m5.hpp
@@ -120,30 +120,38 @@ namespace quda
   template <Dslash5Type...> struct Dslash5TypeList {
   };
 
-  template <typename Float, int nColor, QudaReconstructType recon> struct DomainWall4DApplyFusedM5 {
+  template <bool distance_pc> struct DistanceType { // empty tag type: carries distance_pc as a compile-time flag
+  }; // NOTE(review): dslash_domain_wall_4d.hpp defines the same struct - confirm both are never included together
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct DomainWall4DApplyFusedM5 {
 
-    template <Dslash5Type dslash5_type_impl, Dslash5Type... N>
+    template <bool distance_pc, Dslash5Type dslash5_type_impl, Dslash5Type... N>
     DomainWall4DApplyFusedM5(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                              cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
                              cvector_ref<ColorSpinorField> &y, const Complex *b_5, const Complex *c_5, double a,
                              double m_5, int parity, bool dagger, const int *comm_override, double m_f,
-                             Dslash5TypeList<dslash5_type_impl, N...>, TimeProfile &profile)
+                             DistanceType<distance_pc>, Dslash5TypeList<dslash5_type_impl, N...>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+      ;
+#else
     {
 #ifdef NVSHMEM_COMMS
       errorQuda("Fused Mobius/DWF-4D kernels do not currently work with NVSHMEM.");
 #else
       constexpr int nDim = 4;
       auto halo = ColorSpinorField::create_comms_batch(in);
-      using Arg = DomainWall4DFusedM5Arg<Float, nColor, nDim, recon, dslash5_type_impl>;
+      using Arg = DomainWall4DFusedM5Arg<Float, nColor, nDim, DDArg, recon, dslash5_type_impl>;
       Arg arg(out, in, halo, U, a, m_5, b_5, c_5, a != 0.0, x, y, parity, dagger, comm_override, m_f);
       DomainWall4DFusedM5<Arg> dwf(arg, out, in, halo, y);
       dslash::DslashPolicyTune<decltype(dwf)> policy(dwf, in, halo, profile);
 #endif
     }
+#endif
   };
 
   // use custom instantiate to deal with field splitting if needed
-  template <template <typename, int, QudaReconstructType> class Apply, typename Recon = ReconstructWilson, typename... Args>
+  template <template <typename, int, typename, QudaReconstructType> class Apply, typename Recon = ReconstructWilson,
+            typename... Args>
   void instantiate(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                    cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, const GaugeField &U,
                    Args... args)
diff --git a/lib/dslash_domain_wall_4d_fused_m5.in.cu b/lib/dslash_domain_wall_4d_fused_m5.in.cu
new file mode 100644
index 0000000000..df99e8f863
--- /dev/null
+++ b/lib/dslash_domain_wall_4d_fused_m5.in.cu
@@ -0,0 +1,24 @@
+#include <dslash_domain_wall_4d_fused_m5.hpp>
+
+namespace quda
+{
+  // The @...@ placeholders are filled in by CMake's configure_file, once per generated translation unit
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@; // index into ReconstructWilson::recon
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+
+  constexpr Dslash5Type dslash5_type = Dslash5Type::@QUDA_DSLASH_DWTYPE@;
+
+  using DDArg = @QUDA_DSLASH_DDARG@;
+  using Float = precision_type_mapper<precision>::type;
+
+  template struct DomainWall4DApplyFusedM5<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+
+  template DomainWall4DApplyFusedM5<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::DomainWall4DApplyFusedM5(
+    cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
+    const GaugeField &U, cvector_ref<ColorSpinorField> &y, const Complex *b_5, const Complex *c_5, double a, double m_5,
+    int parity, bool dagger, const int *comm_override, double m_f, DistanceType<distance_pc>,
+    Dslash5TypeList<dslash5_type>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_domain_wall_4d_m5inv.cu b/lib/dslash_domain_wall_4d_m5inv.cu
index e739dddadb..b7b107c5fd 100644
--- a/lib/dslash_domain_wall_4d_m5inv.cu
+++ b/lib/dslash_domain_wall_4d_m5inv.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_domain_wall_4d_fused_m5.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
@@ -16,9 +18,10 @@ namespace quda
                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+      auto dummy = DistanceType<false>();
       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
-                                            dummy_list, profile);
+                                            dummy, dummy_list, profile);
     } else {
       errorQuda("Domain-wall operator has not been built");
     }
diff --git a/lib/dslash_domain_wall_4d_m5inv.hpp b/lib/dslash_domain_wall_4d_m5inv.hpp
new file mode 100644
index 0000000000..e739dddadb
--- /dev/null
+++ b/lib/dslash_domain_wall_4d_m5inv.hpp
@@ -0,0 +1,27 @@
+#include <dslash_domain_wall_4d_fused_m5.hpp>
+
+/**
+   This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
+*/
+
+namespace quda
+{
+
+  // Apply the 4-d preconditioned domain-wall Dslash operator and then m5inv, i.e.
+  //   out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
+  void ApplyDomainWall4DM5inv(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                              const GaugeField &U, double a, double m_5, const Complex *b_5, const Complex *c_5,
+                              cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
+                              bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
+  {
+    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+      auto dummy = DistanceType<false>(); // tag argument now required by the DomainWall4DApplyFusedM5 constructor
+      auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS>();
+      instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
+                                            dummy, dummy_list, profile);
+    } else {
+      errorQuda("Domain-wall operator has not been built");
+    }
+  }
+
+} // namespace quda
diff --git a/lib/dslash_domain_wall_4d_m5inv.in.cu b/lib/dslash_domain_wall_4d_m5inv.in.cu
new file mode 100644
index 0000000000..e739dddadb
--- /dev/null
+++ b/lib/dslash_domain_wall_4d_m5inv.in.cu
@@ -0,0 +1,27 @@
+#include <dslash_domain_wall_4d_fused_m5.hpp>
+
+/**
+   This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
+*/
+
+namespace quda
+{
+
+  // Apply the 4-d preconditioned domain-wall Dslash operator and then m5inv, i.e.
+  //   out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
+  void ApplyDomainWall4DM5inv(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                              const GaugeField &U, double a, double m_5, const Complex *b_5, const Complex *c_5,
+                              cvector_ref<const ColorSpinorField> &x, cvector_ref<ColorSpinorField> &y, int parity,
+                              bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
+  {
+    if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+      auto dummy = DistanceType<false>(); // tag argument now required by the DomainWall4DApplyFusedM5 constructor
+      auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS>();
+      instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
+                                            dummy, dummy_list, profile);
+    } else {
+      errorQuda("Domain-wall operator has not been built");
+    }
+  }
+
+} // namespace quda
diff --git a/lib/dslash_domain_wall_4d_m5inv_m5inv.cu b/lib/dslash_domain_wall_4d_m5inv_m5inv.cu
index 870c311fb2..cd4b760a72 100644
--- a/lib/dslash_domain_wall_4d_m5inv_m5inv.cu
+++ b/lib/dslash_domain_wall_4d_m5inv_m5inv.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_domain_wall_4d_fused_m5.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
@@ -16,9 +18,10 @@ namespace quda
                                    bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+      auto dummy = DistanceType<false>();
       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS_M5_INV_DAG>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
-                                            dummy_list, profile);
+                                            dummy, dummy_list, profile);
     } else {
       errorQuda("Domain-wall operator has not been built");
     }
diff --git a/lib/dslash_domain_wall_4d_m5inv_m5pre.cu b/lib/dslash_domain_wall_4d_m5inv_m5pre.cu
index fedfd154b9..414d26aebf 100644
--- a/lib/dslash_domain_wall_4d_m5inv_m5pre.cu
+++ b/lib/dslash_domain_wall_4d_m5inv_m5pre.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_domain_wall_4d_fused_m5.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
@@ -16,9 +18,10 @@ namespace quda
                                    bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+      auto dummy = DistanceType<false>();
       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_INV_MOBIUS_M5_PRE>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
-                                            dummy_list, profile);
+                                            dummy, dummy_list, profile);
     } else {
       errorQuda("Domain-wall operator has not been built");
     }
diff --git a/lib/dslash_domain_wall_4d_m5mob.cu b/lib/dslash_domain_wall_4d_m5mob.cu
index 76527b9321..87f5804b0e 100644
--- a/lib/dslash_domain_wall_4d_m5mob.cu
+++ b/lib/dslash_domain_wall_4d_m5mob.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_domain_wall_4d_fused_m5.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
@@ -16,9 +18,10 @@ namespace quda
                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+      auto dummy = DistanceType<false>();
       auto dummy_list = Dslash5TypeList<Dslash5Type::DSLASH5_MOBIUS>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
-                                            dummy_list, profile);
+                                            dummy, dummy_list, profile);
     } else {
       errorQuda("Domain-wall operator has not been built");
     }
diff --git a/lib/dslash_domain_wall_4d_m5pre.cu b/lib/dslash_domain_wall_4d_m5pre.cu
index b9ea9dae29..f88e4ea6d4 100644
--- a/lib/dslash_domain_wall_4d_m5pre.cu
+++ b/lib/dslash_domain_wall_4d_m5pre.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_domain_wall_4d_fused_m5.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
@@ -16,9 +18,10 @@ namespace quda
                               bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+      auto dummy = DistanceType<false>();
       auto dummy_list = Dslash5TypeList<Dslash5Type::DSLASH5_MOBIUS_PRE>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
-                                            dummy_list, profile);
+                                            dummy, dummy_list, profile);
     } else {
       errorQuda("Domain-wall operator has not been built");
     }
diff --git a/lib/dslash_domain_wall_4d_m5pre_m5inv.cu b/lib/dslash_domain_wall_4d_m5pre_m5inv.cu
index 33cb13c1b1..c4c708a102 100644
--- a/lib/dslash_domain_wall_4d_m5pre_m5inv.cu
+++ b/lib/dslash_domain_wall_4d_m5pre_m5inv.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_domain_wall_4d_fused_m5.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
@@ -16,9 +18,10 @@ namespace quda
                                    bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+      auto dummy = DistanceType<false>();
       auto dummy_list = Dslash5TypeList<Dslash5Type::M5_PRE_MOBIUS_M5_INV>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
-                                            dummy_list, profile);
+                                            dummy, dummy_list, profile);
     } else {
       errorQuda("Domain-wall operator has not been built");
     }
diff --git a/lib/dslash_domain_wall_4d_m5pre_m5mob.cu b/lib/dslash_domain_wall_4d_m5pre_m5mob.cu
index 7ceabeec6a..017ddae244 100644
--- a/lib/dslash_domain_wall_4d_m5pre_m5mob.cu
+++ b/lib/dslash_domain_wall_4d_m5pre_m5mob.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_domain_wall_4d_fused_m5.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged domain-wall 4-d preconditioned operator, fused with immediately followed fifth dimension operators.
@@ -16,9 +18,10 @@ namespace quda
                                    bool dagger, const int *comm_override, double m_f, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_DOMAIN_WALL_4D_DSLASH>()) {
+      auto dummy = DistanceType<false>();
       auto dummy_list = Dslash5TypeList<Dslash5Type::DSLASH5_MOBIUS_PRE_M5_MOB>();
       instantiate<DomainWall4DApplyFusedM5>(out, in, x, y, U, b_5, c_5, a, m_5, parity, dagger, comm_override, m_f,
-                                            dummy_list, profile);
+                                            dummy, dummy_list, profile);
     } else {
       errorQuda("Domain-wall operator has not been built");
     }
diff --git a/lib/dslash_domain_wall_5d.cu b/lib/dslash_domain_wall_5d.cu
index 3dce0be391..3bb5f8f3a8 100644
--- a/lib/dslash_domain_wall_5d.cu
+++ b/lib/dslash_domain_wall_5d.cu
@@ -1,10 +1,6 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_domain_wall_5d.cuh>
+#define SIGNATURE_ONLY
+#include <dslash_domain_wall_5d.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged domain-wall 5-d preconditioned operator.
@@ -13,80 +9,16 @@
 namespace quda
 {
 
-  template <typename Arg> class DomainWall5D : public Dslash<domainWall5D, Arg>
-  {
-    using Dslash = Dslash<domainWall5D, Arg>;
-    using Dslash::arg;
-    using Dslash::in;
-
-  public:
-    DomainWall5D(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                 const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      Dslash::template instantiate<packShmem>(tp, stream);
-    }
-
-    long long flops() const
-    {
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY: {
-        int Ls = in.X(4);
-        long long bulk = (Ls - 2) * (in.Volume() / Ls);
-        long long wall = 2 * (in.Volume() / Ls);
-        flops += in.size() * 96ll * bulk + 120ll * wall;
-      } break;
-      default: break; // 5-d flops are in the interior kernel
-      }
-      return flops;
-    }
-
-    long long bytes() const
-    {
-      int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed<typename Arg::Float>::value ? sizeof(float) : 0);
-      long long bytes = Dslash::bytes();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY: bytes += in.size() * 2 * spinor_bytes * in.VolumeCB(); break;
-      default: break;
-      }
-      return bytes;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct DomainWall5DApply {
-
-    DomainWall5DApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double m_f, int parity,
-                      bool dagger, const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 5;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      DomainWall5DArg<Float, nColor, nDim, recon> arg(out, in, halo, U, a, m_f, a != 0.0, x, parity, dagger,
-                                                      comm_override);
-      DomainWall5D<decltype(arg)> dwf(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(dwf)> policy(dwf, in, halo, profile);
-    }
-  };
 
-  // Apply the 4-d preconditioned domain-wall Dslash operator
+  // Apply the 5-d preconditioned domain-wall Dslash operator
   // out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
   void ApplyDomainWall5D(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                          const GaugeField &U, double a, double m_f, cvector_ref<const ColorSpinorField> &x, int parity,
                          bool dagger, const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_DOMAIN_WALL_DSLASH>()) {
-      instantiate<DomainWall5DApply>(out, in, x, U, a, m_f, parity, dagger, comm_override, profile);
+      auto dummy = DistanceType<false>();
+      instantiate<DomainWall5DApply>(out, in, x, U, a, m_f, parity, dagger, comm_override, dummy, profile);
     } else {
       errorQuda("Domain-wall operator has not been built");
     }
diff --git a/lib/dslash_domain_wall_5d.hpp b/lib/dslash_domain_wall_5d.hpp
new file mode 100644
index 0000000000..bb030223a5
--- /dev/null
+++ b/lib/dslash_domain_wall_5d.hpp
@@ -0,0 +1,89 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_domain_wall_5d.cuh>
+
+/**
+   This is the gauged domain-wall 5-d preconditioned operator.
+*/
+
+namespace quda
+{
+  /** Tunable 5-d domain-wall dslash; flops()/bytes() add the fifth-dimension cost to the base-class counts. */
+  template <typename Arg> class DomainWall5D : public Dslash<domainWall5D, Arg>
+  {
+    using Dslash = Dslash<domainWall5D, Arg>;
+    using Dslash::arg;
+    using Dslash::in;
+
+  public:
+    DomainWall5D(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                 const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo)
+    {
+    }
+    /** Obtain launch parameters from tuneLaunch, hand them to Dslash::setParam and instantiate on `stream`. */
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      Dslash::template instantiate<packShmem>(tp, stream);
+    }
+    /** Base-class flops plus the fifth-dimension term; that term is only added for interior/uber/policy kernels. */
+    long long flops() const
+    {
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY: {
+        int Ls = in.X(4);                               // extent along dimension index 4
+        long long bulk = (Ls - 2) * (in.Volume() / Ls); // sites on the Ls - 2 interior slices
+        long long wall = 2 * (in.Volume() / Ls);        // sites on the two boundary slices
+        flops += in.size() * (96ll * bulk + 120ll * wall); // in.size() scales the bulk and wall terms alike
+      } break;
+      default: break; // 5-d flops are in the interior kernel
+      }
+      return flops;
+    }
+    /** Base-class bytes plus 2 * spinor_bytes per checkerboard site and per field for the kernel types listed. */
+    long long bytes() const
+    {
+      int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed<typename Arg::Float>::value ? sizeof(float) : 0);
+      long long bytes = Dslash::bytes();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY: bytes += in.size() * 2 * spinor_bytes * in.VolumeCB(); break;
+      default: break; // other kernel types add nothing beyond the base count
+      }
+      return bytes;
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType { // empty tag: encodes distance_pc in the argument type
+  }; // NOTE(review): dslash_improved_staggered.hpp defines this too; both in one TU would be a redefinition
+  /** Struct whose templated constructor assembles the 5-d domain-wall argument, dslash and policy objects. */
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct DomainWall5DApply {
+    template <bool distance_pc> // deduced from the unnamed DistanceType tag; not used in the body
+    DomainWall5DApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double m_f, int parity,
+                      bool dagger, const int *comm_override,DistanceType<distance_pc>, TimeProfile &profile)
+    #ifdef SIGNATURE_ONLY
+          ; // SIGNATURE_ONLY: declare the constructor only, the body below is compiled out
+#else
+    { // full definition, compiled when the header is included without SIGNATURE_ONLY
+      constexpr int nDim = 5;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      DomainWall5DArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, a, m_f, a != 0.0, x, parity, dagger,
+                                                             comm_override); // a != 0.0: presumably the xpay flag
+      DomainWall5D<decltype(arg)> dwf(arg, out, in, halo);
+      dslash::DslashPolicyTune<decltype(dwf)> policy(dwf, in, halo, profile); // presumably runs the dslash — confirm
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_domain_wall_5d.in.cu b/lib/dslash_domain_wall_5d.in.cu
new file mode 100644
index 0000000000..d0aa8b4710
--- /dev/null
+++ b/lib/dslash_domain_wall_5d.in.cu
@@ -0,0 +1,20 @@
+#include <dslash_domain_wall_5d.hpp>
+
+namespace quda
+{
+  // Instantiation template: the @...@ tokens are placeholders, presumably substituted by the build system.
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@; // index into ReconstructWilson::recon[] below
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+  // Explicit instantiation of the class for this precision / nColor / DDArg / reconstruct combination.
+  template struct DomainWall5DApply < Float, nColor, DDArg, ReconstructWilson::recon[reconI]>; 
+  // Explicit instantiation of its constructor template for the configured DistanceType<distance_pc> tag.
+  template DomainWall5DApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::DomainWall5DApply
+	  (cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double m_f, int parity,
+                      bool dagger, const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile);
+} // namespace quda
diff --git a/lib/dslash_improved_staggered.cu b/lib/dslash_improved_staggered.cu
index dd3c3f4ce7..6f1d735e92 100644
--- a/lib/dslash_improved_staggered.cu
+++ b/lib/dslash_improved_staggered.cu
@@ -1,15 +1,6 @@
-#include <dslash.h>
-#include <worker.h>
-#include <dslash_helper.cuh>
-#include <color_spinor_field_order.h>
-#include <gauge_field_order.h>
-#include <color_spinor.h>
-#include <dslash_helper.cuh>
-#include <index_helper.cuh>
-#include <gauge_field.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_staggered.cuh>
+#define SIGNATURE_ONLY
+#include <dslash_improved_staggered.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is a staggered Dirac operator
@@ -18,156 +9,6 @@
 namespace quda
 {
 
-  template <typename Arg> class Staggered : public Dslash<staggered, Arg>
-  {
-    using Dslash = Dslash<staggered, Arg>;
-    using Dslash::arg;
-    using Dslash::halo;
-    using Dslash::in;
-    const GaugeField &L;
-
-  public:
-    Staggered(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-              const ColorSpinorField &halo, const GaugeField &L) :
-      Dslash(arg, out, in, halo), L(L)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      // operator is anti-Hermitian so do not instantiate dagger
-      if (arg.nParity == 1) {
-        if (arg.xpay)
-          Dslash::template instantiate<packStaggeredShmem, 1, false, true>(tp, stream);
-        else
-          Dslash::template instantiate<packStaggeredShmem, 1, false, false>(tp, stream);
-      } else if (arg.nParity == 2) {
-        if (arg.xpay)
-          Dslash::template instantiate<packStaggeredShmem, 2, false, true>(tp, stream);
-        else
-          Dslash::template instantiate<packStaggeredShmem, 2, false, false>(tp, stream);
-      }
-    }
-
-    /*
-      per direction / dimension flops
-      SU(3) matrix-vector flops = (8 Nc - 2) * Nc
-      xpay = 2 * 2 * Nc * Ns
-
-      So for the full dslash we have
-      flops = (2 * 2 * Nd * (8*Nc-2) * Nc)  +  ((2 * 2 * Nd - 1) * 2 * Nc * Ns)
-      flops_xpay = flops + 2 * 2 * Nc * Ns
-
-      For Asqtad this should give 1146 for Nc=3,Ns=2 and 1158 for the axpy equivalent
-    */
-    long long flops() const
-    {
-      int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
-      int ghost_flops = (3 + 1) * (mv_flops + 2 * in.Ncolor() * in.Nspin());
-      int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
-      int num_dir = 2 * 4;                               // hard code factor of 4 in direction since fields may be 5-d
-
-      long long flops_ = 0;
-
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T: flops_ = ghost_flops * 2 * halo.GhostFace()[arg.kernel_type]; break;
-      case EXTERIOR_KERNEL_ALL: {
-        long long ghost_sites
-          = 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        flops_ = ghost_flops * ghost_sites;
-        break;
-      }
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY: {
-        long long sites = halo.Volume();
-        flops_ = (2 * num_dir * mv_flops + // SU(3) matrix-vector multiplies
-                  (2 * num_dir - 1) * 2 * in.Ncolor() * in.Nspin())
-          * sites;                                  // accumulation
-        if (arg.xpay) flops_ += xpay_flops * sites; // axpy is always on interior
-
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for flops done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        flops_ -= ghost_flops * ghost_sites;
-
-        break;
-      }
-      }
-      return flops_;
-    }
-
-    long long bytes() const
-    {
-      int gauge_bytes_fat = QUDA_RECONSTRUCT_NO * in.Precision();
-      int gauge_bytes_long = L.Reconstruct() * in.Precision();
-      int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed<typename Arg::Float>::value ? sizeof(float) : 0);
-      int ghost_bytes = 3 * (spinor_bytes + gauge_bytes_long) + (spinor_bytes + gauge_bytes_fat)
-        + 3 * 2 * spinor_bytes; // last term is the accumulator load/store through the face
-      int num_dir = 2 * 4;      // set to 4-d since we take care of 5-d fermions in derived classes where necessary
-
-      long long bytes_ = 0;
-
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * halo.GhostFace()[arg.kernel_type]; break;
-      case EXTERIOR_KERNEL_ALL: {
-        long long ghost_sites
-          = 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        bytes_ = ghost_bytes * ghost_sites;
-        break;
-      }
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY: {
-        long long sites = halo.Volume();
-        bytes_ = (num_dir * (gauge_bytes_fat + gauge_bytes_long) + // gauge reads
-                  num_dir * 2 * spinor_bytes +                     // spinor reads
-                  spinor_bytes)
-          * sites; // spinor write
-        if (arg.xpay) bytes_ += spinor_bytes;
-
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for bytes done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        bytes_ -= ghost_bytes * ghost_sites;
-
-        break;
-      }
-      }
-      return bytes_;
-    }
-
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon_l> struct ImprovedStaggeredApply {
-
-    ImprovedStaggeredApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                           cvector_ref<const ColorSpinorField> &x, const GaugeField &L, const GaugeField &U, double a,
-                           int parity, bool dagger, const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      constexpr bool improved = true;
-      constexpr QudaReconstructType recon_u = QUDA_RECONSTRUCT_NO;
-      auto halo = ColorSpinorField::create_comms_batch(in, 3);
-      StaggeredArg<Float, nColor, nDim, recon_u, recon_l, improved> arg(out, in, halo, U, L, a, x, parity, dagger,
-                                                                        comm_override);
-      Staggered<decltype(arg)> staggered(arg, out, in, halo, L);
-      dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
-    }
-  };
-
   void ApplyImprovedStaggered(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                               const GaugeField &U, const GaugeField &L, double a, cvector_ref<const ColorSpinorField> &x,
                               int parity, bool dagger, const int *comm_override, TimeProfile &profile)
@@ -178,9 +19,9 @@ namespace quda
           errorQuda("partitioned dimension with local size less than 6 is not supported in improved staggered dslash");
         }
       }
-
+      auto dummy = DistanceType<false>();
       // L must be first gauge field argument since we template on long reconstruct
-      instantiate<ImprovedStaggeredApply, ReconstructStaggered>(out, in, x, L, U, a, parity, dagger, comm_override,
+      instantiate<ImprovedStaggeredApply, ReconstructStaggered>(out, in, x, L, U, a, parity, dagger, comm_override, dummy, 
                                                                 profile);
     } else {
       errorQuda("Improved staggered operator has not been built");
diff --git a/lib/dslash_improved_staggered.hpp b/lib/dslash_improved_staggered.hpp
new file mode 100644
index 0000000000..bd54112dea
--- /dev/null
+++ b/lib/dslash_improved_staggered.hpp
@@ -0,0 +1,179 @@
+#include <dslash.h>
+#include <worker.h>
+#include <dslash_helper.cuh>
+#include <color_spinor_field_order.h>
+#include <gauge_field_order.h>
+#include <color_spinor.h>
+#include <dslash_helper.cuh>
+#include <index_helper.cuh>
+#include <gauge_field.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_staggered.cuh>
+
+/**
+   This is a staggered Dirac operator
+*/
+
+namespace quda
+{
+  /** Tunable staggered dslash that also holds the gauge field L, whose reconstruct type enters bytes(). */
+  template <typename Arg> class Staggered : public Dslash<staggered, Arg>
+  {
+    using Dslash = Dslash<staggered, Arg>;
+    using Dslash::arg;
+    using Dslash::halo;
+    using Dslash::in;
+    const GaugeField &L;
+
+  public:
+    Staggered(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+              const ColorSpinorField &halo, const GaugeField &L) :
+      Dslash(arg, out, in, halo), L(L)
+    {
+    }
+    /** Tune, then instantiate for nParity 1 or 2, with or without xpay; any other nParity instantiates nothing. */
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      // operator is anti-Hermitian so do not instantiate dagger
+      if (arg.nParity == 1) {
+        if (arg.xpay)
+          Dslash::template instantiate<packStaggeredShmem, 1, false, true>(tp, stream);
+        else
+          Dslash::template instantiate<packStaggeredShmem, 1, false, false>(tp, stream);
+      } else if (arg.nParity == 2) {
+        if (arg.xpay)
+          Dslash::template instantiate<packStaggeredShmem, 2, false, true>(tp, stream);
+        else
+          Dslash::template instantiate<packStaggeredShmem, 2, false, false>(tp, stream);
+      }
+    }
+
+    /*
+      per direction / dimension flops
+      SU(3) matrix-vector flops = (8 Nc - 2) * Nc
+      xpay = 2 * 2 * Nc * Ns
+
+      So for the full dslash we have
+      flops = (2 * 2 * Nd * (8*Nc-2) * Nc)  +  ((2 * 2 * Nd - 1) * 2 * Nc * Ns)
+      flops_xpay = flops + 2 * 2 * Nc * Ns
+
+      For Asqtad this should give 1146 for Nc=3,Ns=1 and 1158 for the xpay equivalent
+    */
+    long long flops() const
+    {
+      int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
+      int ghost_flops = (3 + 1) * (mv_flops + 2 * in.Ncolor() * in.Nspin()); // presumably 3 long + 1 fat hop per face site
+      int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
+      int num_dir = 2 * 4;                               // hard code factor of 4 in direction since fields may be 5-d
+
+      long long flops_ = 0;
+
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T: flops_ = ghost_flops * 2 * halo.GhostFace()[arg.kernel_type]; break;
+      case EXTERIOR_KERNEL_ALL: {
+        long long ghost_sites
+          = 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        flops_ = ghost_flops * ghost_sites;
+        break;
+      }
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY: {
+        long long sites = halo.Volume();
+        flops_ = (2 * num_dir * mv_flops + // SU(3) matrix-vector multiplies
+                  (2 * num_dir - 1) * 2 * in.Ncolor() * in.Nspin())
+          * sites;                                  // accumulation
+        if (arg.xpay) flops_ += xpay_flops * sites; // axpy is always on interior
+
+        if (arg.kernel_type == KERNEL_POLICY) break; // the policy count keeps the full-volume figure
+        // now correct for flops done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        flops_ -= ghost_flops * ghost_sites;
+
+        break;
+      }
+      }
+      return flops_;
+    }
+    /** Estimated memory traffic for the current kernel type; the fat links are counted without reconstruction. */
+    long long bytes() const
+    {
+      int gauge_bytes_fat = QUDA_RECONSTRUCT_NO * in.Precision();
+      int gauge_bytes_long = L.Reconstruct() * in.Precision();
+      int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed<typename Arg::Float>::value ? sizeof(float) : 0);
+      int ghost_bytes = 3 * (spinor_bytes + gauge_bytes_long) + (spinor_bytes + gauge_bytes_fat)
+        + 3 * 2 * spinor_bytes; // last term is the accumulator load/store through the face
+      int num_dir = 2 * 4;      // set to 4-d since we take care of 5-d fermions in derived classes where necessary
+
+      long long bytes_ = 0;
+
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * halo.GhostFace()[arg.kernel_type]; break;
+      case EXTERIOR_KERNEL_ALL: {
+        long long ghost_sites
+          = 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        bytes_ = ghost_bytes * ghost_sites;
+        break;
+      }
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY: {
+        long long sites = halo.Volume();
+        bytes_ = (num_dir * (gauge_bytes_fat + gauge_bytes_long) + // gauge reads
+                  num_dir * 2 * spinor_bytes +                     // spinor reads
+                  spinor_bytes)
+          * sites; // spinor write
+        if (arg.xpay) bytes_ += spinor_bytes; // NOTE(review): not scaled by sites, unlike the xpay flops term — confirm
+
+        if (arg.kernel_type == KERNEL_POLICY) break; // the policy count keeps the full-volume figure
+        // now correct for bytes done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        bytes_ -= ghost_bytes * ghost_sites;
+
+        break;
+      }
+      }
+      return bytes_;
+    }
+
+  };
+
+  template <bool distance_pc> struct DistanceType { // empty tag: encodes distance_pc in the argument type
+  }; // NOTE(review): dslash_domain_wall_5d.hpp defines this too; both in one TU would be a redefinition
+
+  /** Struct whose templated constructor assembles the improved-staggered argument, dslash and policy objects. */
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon_l> struct ImprovedStaggeredApply {
+    template <bool distance_pc> // deduced from the unnamed DistanceType tag; not used in the body
+    ImprovedStaggeredApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                           cvector_ref<const ColorSpinorField> &x, const GaugeField &L, const GaugeField &U, double a,
+                           int parity, bool dagger, const int *comm_override,DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+    ; // SIGNATURE_ONLY: declare the constructor only, the body below is compiled out
+#else
+    { // full definition, compiled when the header is included without SIGNATURE_ONLY
+      constexpr int nDim = 4;
+      constexpr bool improved = true;
+      constexpr QudaReconstructType recon_u = QUDA_RECONSTRUCT_NO; // U is always handled without reconstruction
+      auto halo = ColorSpinorField::create_comms_batch(in, 3); // 3: presumably the halo depth for L — confirm
+      StaggeredArg<Float, nColor, nDim, DDArg, recon_u, recon_l, improved> arg(out, in, halo, U, L, a, x, parity,
+                                                                               dagger, comm_override);
+      Staggered<decltype(arg)> staggered(arg, out, in, halo, L);
+      dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile); // presumably runs the dslash
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_improved_staggered.in.cu b/lib/dslash_improved_staggered.in.cu
new file mode 100644
index 0000000000..5fe8ed3490
--- /dev/null
+++ b/lib/dslash_improved_staggered.in.cu
@@ -0,0 +1,22 @@
+#include <dslash_improved_staggered.hpp>
+  
+namespace quda
+{
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION; // @...@: presumably build-time placeholders
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@; // index into ReconstructStaggered::recon[] below
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float;
+
+
+  // Explicit instantiation of the class for this precision / nColor / DDArg / reconstruct combination.
+  template struct ImprovedStaggeredApply<Float, nColor, DDArg, ReconstructStaggered::recon[reconI]>;
+  // Explicit instantiation of its constructor template for the configured DistanceType<distance_pc> tag.
+  template ImprovedStaggeredApply<Float, nColor, DDArg, ReconstructStaggered::recon[reconI]>::ImprovedStaggeredApply(
+		  cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                  cvector_ref<const ColorSpinorField> &x, const GaugeField &L, const GaugeField &U, double a,
+                  int parity, bool dagger, const int *comm_override,DistanceType<distance_pc>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_mdw_fused_impl.hpp b/lib/dslash_mdw_fused_impl.hpp
index 28c9be93c5..b873da054b 100644
--- a/lib/dslash_mdw_fused_impl.hpp
+++ b/lib/dslash_mdw_fused_impl.hpp
@@ -21,7 +21,7 @@ namespace quda
 
 #ifdef QUDA_MMA_AVAILABLE
 
-    template <class store_t, int nColor, QudaReconstructType recon, int Ls_> class FusedDslash : public TunableGridStrideKernel2D
+    template <class store_t, int nColor, typename DDArg, QudaReconstructType recon, int Ls_> class FusedDslash : public TunableGridStrideKernel2D
     {
       ColorSpinorField &out;
       const ColorSpinorField &in;
@@ -257,8 +257,8 @@ namespace quda
 #if defined(GPU_DOMAIN_WALL_DIRAC) && defined(QUDA_MMA_AVAILABLE)
     template <int Ls>
     struct FusedDslashLs {
-      template <class store_t, int nColor, QudaReconstructType recon>
-      using type = FusedDslash<store_t, nColor, recon, Ls>;
+      template <class store_t, int nColor, typename DDArg, QudaReconstructType recon>
+      using type = FusedDslash<store_t, nColor, DDArg, recon, Ls>;
     };
 #endif
   } // namespace mobius_tensor_core
diff --git a/lib/dslash_ndeg_twisted_clover.cu b/lib/dslash_ndeg_twisted_clover.cu
index 30809f2eda..04e9bb94f1 100644
--- a/lib/dslash_ndeg_twisted_clover.cu
+++ b/lib/dslash_ndeg_twisted_clover.cu
@@ -1,11 +1,6 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <clover_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_ndeg_twisted_clover.cuh>
+#define SIGNATURE_ONLY
+#include <dslash_ndeg_twisted_clover.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged non-degenerate twisted-clover operator acting on a 
@@ -15,91 +10,17 @@
 namespace quda
 {
 
-  template <typename Arg> class NdegTwistedClover : public Dslash<nDegTwistedClover, Arg>
+    void ApplyNdegTwistedClover(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                const GaugeField &U, const CloverField &A, double a, double b, double c,
+                                cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
+                                const int *comm_override, TimeProfile &profile)
     {
-      using Dslash = Dslash<nDegTwistedClover, Arg>;
-      using Dslash::arg;
-      using Dslash::halo;
-      using Dslash::in;
-
-      unsigned int sharedBytesPerThread() const
-      {
-        return 2 * in.Ncolor() * 4 * sizeof(typename mapper<typename Arg::Float>::type);
-      }
-
-    public:
-      NdegTwistedClover(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                        const ColorSpinorField &halo) :
-        Dslash(arg, out, in, halo)
-      {
-        TunableKernel3D::resizeStep(2, 1);
-      }
-
-      void apply(const qudaStream_t &stream)
-      {
-        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-        Dslash::setParam(tp);
-        if (arg.xpay)
-          Dslash::template instantiate<packShmem, true>(tp, stream);
-        else
-          errorQuda("Non-degenerate twisted-clover operator only defined for xpay=true");
+      if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) { // compile-time gate: the else branch is taken when twisted-clover is not enabled
+        auto dummy = DistanceType<false>(); // tag argument: picks the distance_pc = false constructor of NdegTwistedCloverApply
+        instantiate<NdegTwistedCloverApply>(out, in, x, U, A, a, b, c, parity, dagger, comm_override, dummy, profile); // NOTE(review): under SIGNATURE_ONLY the constructor body must come from another translation unit - confirm a distance_pc = false instantiation is always generated
+      } else {
+        errorQuda("Non-degenerate twisted-clover operator has not been built");
+      }
-      
-      long long flops() const
-      {
-        int clover_flops = 504;
-        long long flops = Dslash::flops();
-        switch (arg.kernel_type) {
-        case INTERIOR_KERNEL:
-        case KERNEL_POLICY:
-          // b and c multiply (= 2 * 48 * in.Volume())
-          flops += 2 * in.Ncolor() * 4 * 4 * halo.Volume(); // complex * Nc * Ns * fma * vol
-          flops += clover_flops * halo.Volume();
-          break;
-        default: break; // twisted-mass flops are in the interior kernel
-        }
-        return flops;
-      }
-      long long bytes() const
-      {
-        int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
-        
-        long long bytes = Dslash::bytes();
-        switch (arg.kernel_type) {
-        case INTERIOR_KERNEL:
-        case KERNEL_POLICY: bytes += clover_bytes * halo.Volume(); break;
-        default: break;
-        }
-        
-        return bytes;
-      }
-    };
-  
-  template <typename Float, int nColor, QudaReconstructType recon> struct NdegTwistedCloverApply {
-
-    NdegTwistedCloverApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                           cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A, double a,
-                           double b, double c, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      NdegTwistedCloverArg<Float, nColor, nDim, recon> arg(out, in, halo, U, A, a, b, c, x, parity, dagger,
-                                                           comm_override);
-      NdegTwistedClover<decltype(arg)> twisted(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
     }
-  };
 
-  void ApplyNdegTwistedClover(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                              const GaugeField &U, const CloverField &A, double a, double b, double c,
-                              cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, const int *comm_override,
-                              TimeProfile &profile)
-  {
-    if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
-      instantiate<NdegTwistedCloverApply>(out, in, x, U, A, a, b, c, parity, dagger, comm_override, profile);
-    } else {
-      errorQuda("Non-degenerate twisted-clover operator has not been built");
-    }
-  }
-  
 } // namespace quda
diff --git a/lib/dslash_ndeg_twisted_clover.hpp b/lib/dslash_ndeg_twisted_clover.hpp
new file mode 100644
index 0000000000..285aab6c3c
--- /dev/null
+++ b/lib/dslash_ndeg_twisted_clover.hpp
@@ -0,0 +1,100 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <clover_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_ndeg_twisted_clover.cuh>
+
+/**
+   This is the gauged non-degenerate twisted-clover operator acting on a 
+   quark doublet.
+*/
+
+namespace quda
+{
+  /** Driver for the non-degenerate twisted-clover dslash: tunes, dispatches through the Dslash base and reports flop/byte counts. */
+  template <typename Arg> class NdegTwistedClover : public Dslash<nDegTwistedClover, Arg>
+    {
+      using Dslash = Dslash<nDegTwistedClover, Arg>;
+      using Dslash::arg;
+      using Dslash::halo;
+      using Dslash::in;
+      // Bytes per thread: 2 * Ncolor * 4 elements of the type mapped from Arg::Float.
+      unsigned int sharedBytesPerThread() const
+      {
+        return 2 * in.Ncolor() * 4 * sizeof(typename mapper<typename Arg::Float>::type);
+      }
+
+    public:
+      NdegTwistedClover(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                        const ColorSpinorField &halo) :
+        Dslash(arg, out, in, halo)
+      {
+        TunableKernel3D::resizeStep(2, 1); // NOTE(review): the preconditioned sibling documents this same call as forcing flavor to be contained in the block
+      }
+      // Obtain tuned launch parameters, hand them to the base class, then dispatch; only the xpay = true form is supported.
+      void apply(const qudaStream_t &stream)
+      {
+        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+        Dslash::setParam(tp);
+        if (arg.xpay)
+          Dslash::template instantiate<packShmem, true>(tp, stream);
+        else
+          errorQuda("Non-degenerate twisted-clover operator only defined for xpay=true");
+      }
+      // Base-class flops plus, for the interior and policy kernel types, the b/c multiply and the clover term, both scaled by halo.Volume().
+      long long flops() const
+      {
+        int clover_flops = 504;
+        long long flops = Dslash::flops();
+        switch (arg.kernel_type) {
+        case INTERIOR_KERNEL:
+        case KERNEL_POLICY: // NOTE(review): NdegTwistedMass::flops also lists UBER_KERNEL and scales by in.size() - confirm the difference is intended
+          // b and c multiply (= 2 * 48 * halo.Volume() when Ncolor = 3)
+          flops += 2 * in.Ncolor() * 4 * 4 * halo.Volume(); // complex * Nc * Ns * fma * vol
+          flops += clover_flops * halo.Volume();
+          break;
+        default: break; // twisted-mass flops are in the interior kernel
+        }
+        return flops;
+      }
+      long long bytes() const // base-class traffic plus the clover-field bytes for the interior and policy kernel types
+      {
+        int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0); // 72 values at field precision, plus two floats for fixed-point Float
+        
+        long long bytes = Dslash::bytes();
+        switch (arg.kernel_type) {
+        case INTERIOR_KERNEL:
+        case KERNEL_POLICY: bytes += clover_bytes * halo.Volume(); break;
+        default: break;
+        }
+        
+        return bytes;
+      }
+    };
+    /** Empty tag type: passes the compile-time distance_pc flag as an argument type. NOTE(review): the same definition is repeated in dslash_ndeg_twisted_clover_preconditioned.hpp and dslash_ndeg_twisted_mass.hpp, and none of these headers has an include guard - including two of them in one translation unit would be a redefinition. */
+    template <bool distance_pc> struct DistanceType {
+    };
+    /** Functor whose constructor builds the kernel argument struct, wraps it in NdegTwistedClover and constructs a DslashPolicyTune over it. When SIGNATURE_ONLY is defined only the constructor declaration is emitted, so the body must be compiled where this header is included without that macro. */
+    template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct NdegTwistedCloverApply {
+      template <bool distance_pc> // deduced from the unnamed DistanceType tag argument; the tag is not used inside the body
+      NdegTwistedCloverApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                             cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A, double a,
+                             double b, double c, int parity, bool dagger, const int *comm_override,DistanceType<distance_pc>,TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+    ;
+#else
+      {
+        constexpr int nDim = 4;
+        auto halo = ColorSpinorField::create_comms_batch(in); // halo field created from the input batch
+        NdegTwistedCloverArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, A, a, b, c, x, parity, dagger,
+                                                                    comm_override);
+        NdegTwistedClover<decltype(arg)> twisted(arg, out, in, halo);
+        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile); // presumably tunes and runs the operator on construction - confirm in dslash_policy.hpp
+      }
+#endif
+    };
+
+} // namespace quda
diff --git a/lib/dslash_ndeg_twisted_clover.in.cu b/lib/dslash_ndeg_twisted_clover.in.cu
new file mode 100644
index 0000000000..f836f3d455
--- /dev/null
+++ b/lib/dslash_ndeg_twisted_clover.in.cu
@@ -0,0 +1,20 @@
+#include <dslash_ndeg_twisted_clover.hpp>
+
+namespace quda
+{
+    // Values written as @NAME@ are placeholders; presumably substituted per generated translation unit by the build system.
+    constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+    constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+    constexpr int reconI = @QUDA_DSLASH_RECONI@;
+    constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+    // Type aliases derived from the selections above.
+    using DDArg = @QUDA_DSLASH_DDARG@;
+    using Float = precision_type_mapper<precision>::type;
+    // Explicit instantiation of the functor class for this parameter combination ...
+    template struct NdegTwistedCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+    // ... and of its constructor template for the selected distance_pc tag.
+    template NdegTwistedCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::NdegTwistedCloverApply(
+      cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A, double a, double b,
+      double c, int parity, bool dagger, const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile);
+} // namespace quda
diff --git a/lib/dslash_ndeg_twisted_clover_preconditioned.cu b/lib/dslash_ndeg_twisted_clover_preconditioned.cu
index 71c161804b..b01ff7ee77 100644
--- a/lib/dslash_ndeg_twisted_clover_preconditioned.cu
+++ b/lib/dslash_ndeg_twisted_clover_preconditioned.cu
@@ -1,11 +1,6 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <clover_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_ndeg_twisted_clover_preconditioned.cuh>
+#define SIGNATURE_ONLY
+#include <dslash_ndeg_twisted_clover_preconditioned.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged preconditioned twisted-clover operator 
@@ -14,114 +9,20 @@
 
 namespace quda
 {
-  template <typename Arg> class NdegTwistedCloverPreconditioned : public Dslash<nDegTwistedCloverPreconditioned, Arg>
-    {
-      using Dslash = Dslash<nDegTwistedCloverPreconditioned, Arg>;
-      using Dslash::arg;
-      using Dslash::halo;
-      using Dslash::in;
 
-      unsigned int sharedBytesPerThread() const
-      {
-        return (in.Nspin() / 2) * in.Ncolor() * 2 * sizeof(typename mapper<typename Arg::Float>::type);
-      }
-      
-    public:
-      NdegTwistedCloverPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out,
-                                      cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
-        Dslash(arg, out, in, halo)
-      {
-        TunableKernel3D::resizeStep(2, 1); // this will force flavor to be contained in the block
-      }
-      
-      void apply(const qudaStream_t &stream)
-      {
-        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-        Dslash::setParam(tp);
-        if (arg.nParity != 1) errorQuda("Preconditioned non-degenerate twisted-clover operator not defined nParity=%d", arg.nParity);
-       
-        if (arg.xpay){
-          if (arg.dagger) errorQuda("xpay operator not only defined for not dagger");
-          Dslash::template instantiate<packShmem, 1, false, true>(tp, stream);
-        } else {
-          if (arg.dagger)
-            Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
-          else
-            Dslash::template instantiate<packShmem, 1, false, false>(tp, stream);
-        }
-      }
-
-      void initTuneParam(TuneParam &param) const
-      {
-        Dslash::initTuneParam(param);
-        param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
-      }
-      
-      void defaultTuneParam(TuneParam &param) const
-      {
-        Dslash::defaultTuneParam(param);
-        param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
-      }
-      
-      long long flops() const
-      {
-        int clover_flops = 504;
-        long long flops = Dslash::flops();
-        switch (arg.kernel_type) {
-        case INTERIOR_KERNEL:
-        case KERNEL_POLICY:
-          // b and c multiply (= 2 * 48 * in.Volume())
-          flops += 2 * in.Ncolor() * 4 * 4 * halo.Volume(); // complex * Nc * Ns * fma * vol
-          flops += clover_flops * halo.Volume();
-          break;
-        default: break; // twisted-mass flops are in the interior kernel
-        }
-        return flops;
-      }
-      
-      long long bytes() const
-      {
-        int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
-        
-        long long bytes = Dslash::bytes();
-        switch (arg.kernel_type) {
-        case INTERIOR_KERNEL:
-        case KERNEL_POLICY: bytes += clover_bytes * halo.Volume(); break;
-        default: break;
-        }
-        
-        return bytes;
-      }
-    };
-  
-  template <typename Float, int nColor, QudaReconstructType recon> struct NdegTwistedCloverPreconditionedApply {
-
-    NdegTwistedCloverPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                         cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
-                                         const CloverField &A, double a, double b, double c, bool xpay, int parity,
-                                         bool dagger, const int *comm_override, TimeProfile &profile)
+    void ApplyNdegTwistedCloverPreconditioned(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                              const GaugeField &U, const CloverField &A, double a, double b, double c,
+                                              bool xpay, cvector_ref<const ColorSpinorField> &x, int parity,
+                                              bool dagger, const int *comm_override, TimeProfile &profile)
     {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      NdegTwistedCloverPreconditionedArg<Float, nColor, nDim, recon> arg(out, in, halo, U, A, a, b, c, xpay, x, parity,
-                                                                         dagger, comm_override);
-      NdegTwistedCloverPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
+      if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) { // compile-time gate: the else branch is taken when twisted-clover is not enabled
+        auto dummy = DistanceType<false>(); // tag argument: picks the distance_pc = false constructor of NdegTwistedCloverPreconditionedApply
+        instantiate<NdegTwistedCloverPreconditionedApply>(out, in, x, U, A, a, b, c, xpay, parity, dagger,
+                                                          comm_override, dummy, profile);
+      } else {
+        errorQuda("Non-degenerate preconditioned twisted-clover operator has not been built");
+      }
     }
-  };
 
-  void ApplyNdegTwistedCloverPreconditioned(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                            const GaugeField &U, const CloverField &A, double a, double b, double c,
-                                            bool xpay, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
-                                            const int *comm_override, TimeProfile &profile)
-  {
-    if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
-      instantiate<NdegTwistedCloverPreconditionedApply>(out, in, x, U, A, a, b, c, xpay, parity, dagger, comm_override,
-                                                        profile);
-    } else {
-      errorQuda("Non-degenerate preconditioned twisted-clover operator has not been built");
-    }
-  }
-  
 } // namespace quda
 
diff --git a/lib/dslash_ndeg_twisted_clover_preconditioned.hpp b/lib/dslash_ndeg_twisted_clover_preconditioned.hpp
new file mode 100644
index 0000000000..a2763a5bca
--- /dev/null
+++ b/lib/dslash_ndeg_twisted_clover_preconditioned.hpp
@@ -0,0 +1,124 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <clover_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_ndeg_twisted_clover_preconditioned.cuh>
+
+/**
+   This is the gauged preconditioned twisted-clover operator 
+   acting on a non-degenerate quark doublet.
+*/
+
+namespace quda
+{
+  template <typename Arg> class NdegTwistedCloverPreconditioned : public Dslash<nDegTwistedCloverPreconditioned, Arg>
+    { // Driver for the preconditioned non-degenerate twisted-clover dslash: tunes, dispatches through the Dslash base and reports flop/byte counts.
+      using Dslash = Dslash<nDegTwistedCloverPreconditioned, Arg>;
+      using Dslash::arg;
+      using Dslash::halo;
+      using Dslash::in;
+      // Bytes per thread: (Nspin / 2) * Ncolor * 2 elements of the type mapped from Arg::Float.
+      unsigned int sharedBytesPerThread() const
+      {
+        return (in.Nspin() / 2) * in.Ncolor() * 2 * sizeof(typename mapper<typename Arg::Float>::type);
+      }
+      
+    public:
+      NdegTwistedCloverPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out,
+                                      cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
+        Dslash(arg, out, in, halo)
+      {
+        TunableKernel3D::resizeStep(2, 1); // this will force flavor to be contained in the block
+      }
+      // Obtain tuned launch parameters, validate the requested variant, then dispatch on (dagger, xpay).
+      void apply(const qudaStream_t &stream)
+      {
+        TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+        Dslash::setParam(tp);
+        if (arg.nParity != 1) errorQuda("Preconditioned non-degenerate twisted-clover operator not defined nParity=%d", arg.nParity);
+        // The last two template arguments follow arg.dagger and arg.xpay; xpay together with dagger is rejected.
+        if (arg.xpay){
+          if (arg.dagger) errorQuda("xpay operator only defined for not dagger");
+          Dslash::template instantiate<packShmem, 1, false, true>(tp, stream);
+        } else {
+          if (arg.dagger)
+            Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
+          else
+            Dslash::template instantiate<packShmem, 1, false, false>(tp, stream);
+        }
+      }
+      // Both tuning hooks defer to the base class and then set shared_bytes from the per-thread size times the block volume.
+      void initTuneParam(TuneParam &param) const
+      {
+        Dslash::initTuneParam(param);
+        param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
+      }
+      
+      void defaultTuneParam(TuneParam &param) const
+      {
+        Dslash::defaultTuneParam(param);
+        param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
+      }
+      // Base-class flops plus, for the interior and policy kernel types, the b/c multiply and the clover term, both scaled by halo.Volume().
+      long long flops() const
+      {
+        int clover_flops = 504;
+        long long flops = Dslash::flops();
+        switch (arg.kernel_type) {
+        case INTERIOR_KERNEL:
+        case KERNEL_POLICY:
+          // b and c multiply (= 2 * 48 * halo.Volume() when Ncolor = 3)
+          flops += 2 * in.Ncolor() * 4 * 4 * halo.Volume(); // complex * Nc * Ns * fma * vol
+          flops += clover_flops * halo.Volume();
+          break;
+        default: break; // twisted-mass flops are in the interior kernel
+        }
+        return flops;
+      }
+      // Base-class traffic plus the clover-field bytes for the interior and policy kernel types.
+      long long bytes() const
+      {
+        int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0); // 72 values at field precision, plus two floats for fixed-point Float
+        
+        long long bytes = Dslash::bytes();
+        switch (arg.kernel_type) {
+        case INTERIOR_KERNEL:
+        case KERNEL_POLICY: bytes += clover_bytes * halo.Volume(); break;
+        default: break;
+        }
+        
+        return bytes;
+      }
+    };
+
+    /** Empty tag type: passes the compile-time distance_pc flag as an argument type. NOTE(review): the same definition is repeated in dslash_ndeg_twisted_clover.hpp and dslash_ndeg_twisted_mass.hpp, and none of these headers has an include guard - including two of them in one translation unit would be a redefinition. */
+    template <bool distance_pc> struct DistanceType {
+    };
+
+    /** Functor whose constructor builds the kernel argument struct, wraps it in NdegTwistedCloverPreconditioned and constructs a DslashPolicyTune over it. */
+    template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
+    struct NdegTwistedCloverPreconditionedApply {
+      template <bool distance_pc> // deduced from the unnamed DistanceType tag argument; the tag is not used inside the body
+      NdegTwistedCloverPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                           cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
+                                           const CloverField &A, double a, double b, double c, bool xpay, int parity,
+                                           bool dagger, const int *comm_override,DistanceType<distance_pc>,TimeProfile &profile)
+      // When SIGNATURE_ONLY is defined only the declaration is emitted; the body must be compiled where this header is included without it.
+#ifdef SIGNATURE_ONLY
+    ;
+#else
+      {
+        constexpr int nDim = 4;
+        auto halo = ColorSpinorField::create_comms_batch(in); // halo field created from the input batch
+        NdegTwistedCloverPreconditionedArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, A, a, b, c, xpay, x,
+                                                                                  parity, dagger, comm_override);
+        NdegTwistedCloverPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile); // presumably tunes and runs the operator on construction - confirm in dslash_policy.hpp
+      }
+#endif
+    };
+} // namespace quda
+
diff --git a/lib/dslash_ndeg_twisted_clover_preconditioned.in.cu b/lib/dslash_ndeg_twisted_clover_preconditioned.in.cu
new file mode 100644
index 0000000000..e216de6138
--- /dev/null
+++ b/lib/dslash_ndeg_twisted_clover_preconditioned.in.cu
@@ -0,0 +1,24 @@
+#include <dslash_ndeg_twisted_clover_preconditioned.hpp>
+
+namespace quda
+{
+    // Values written as @NAME@ are placeholders; presumably substituted per generated translation unit by the build system.
+    constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+    constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+    constexpr int reconI = @QUDA_DSLASH_RECONI@;
+    constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+    // Type aliases derived from the selections above.
+    using DDArg = @QUDA_DSLASH_DDARG@;
+    using Float = precision_type_mapper<precision>::type;
+    // Explicit instantiation of the functor class for this parameter combination ...
+    template struct NdegTwistedCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+
+    // ... and of its constructor template for the selected distance_pc tag.
+    template NdegTwistedCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::NdegTwistedCloverPreconditionedApply(
+      cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+      cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A,
+      double a, double b, double c, bool xpay, int parity, bool dagger,
+      const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile);
+    
+} // namespace quda
+
diff --git a/lib/dslash_ndeg_twisted_mass.cu b/lib/dslash_ndeg_twisted_mass.cu
index 6e363e1e91..2c0bda1fe1 100644
--- a/lib/dslash_ndeg_twisted_mass.cu
+++ b/lib/dslash_ndeg_twisted_mass.cu
@@ -1,10 +1,6 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_ndeg_twisted_mass.cuh>
+#define SIGNATURE_ONLY
+#include <dslash_ndeg_twisted_mass.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the gauged twisted-mass operator acting on a non-generate
@@ -14,64 +10,13 @@
 namespace quda
 {
 
-  template <typename Arg> class NdegTwistedMass : public Dslash<nDegTwistedMass, Arg>
-  {
-    using Dslash = Dslash<nDegTwistedMass, Arg>;
-    using Dslash::arg;
-    using Dslash::in;
-
-  public:
-    NdegTwistedMass(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                    const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      if (arg.xpay)
-        Dslash::template instantiate<packShmem, true>(tp, stream);
-      else
-        errorQuda("Non-degenerate twisted-mass operator only defined for xpay=true");
-    }
-
-    long long flops() const
-    {
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-        flops += in.size() * 2 * in.Ncolor() * 4 * 4 * in.Volume(); // complex * Nc * Ns * fma * vol
-        break;
-      default: break; // twisted-mass flops are in the interior kernel
-      }
-      return flops;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct NdegTwistedMassApply {
-
-    NdegTwistedMassApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                         cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b, double c,
-                         int parity, bool dagger, const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      NdegTwistedMassArg<Float, nColor, nDim, recon> arg(out, in, halo, U, a, b, c, x, parity, dagger, comm_override);
-      NdegTwistedMass<decltype(arg)> twisted(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
-    }
-  };
-
   void ApplyNdegTwistedMass(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                             const GaugeField &U, double a, double b, double c, cvector_ref<const ColorSpinorField> &x,
                             int parity, bool dagger, const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
-      instantiate<NdegTwistedMassApply>(out, in, x, U, a, b, c, parity, dagger, comm_override, profile);
+      auto dummy = DistanceType<false>(); // tag argument: picks the distance_pc = false constructor of NdegTwistedMassApply
+      instantiate<NdegTwistedMassApply>(out, in, x, U, a, b, c, parity, dagger, comm_override,dummy,profile);
     } else {
       errorQuda("Non-degenerate twisted-mass operator has not been built");
     }
diff --git a/lib/dslash_ndeg_twisted_mass.hpp b/lib/dslash_ndeg_twisted_mass.hpp
new file mode 100644
index 0000000000..432b73791a
--- /dev/null
+++ b/lib/dslash_ndeg_twisted_mass.hpp
@@ -0,0 +1,77 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_ndeg_twisted_mass.cuh>
+
+/**
+   This is the gauged twisted-mass operator acting on a non-degenerate
+   quark doublet.
+*/
+
+namespace quda
+{
+  /** Driver for the non-degenerate twisted-mass dslash: tunes, dispatches through the Dslash base and reports the flop count. */
+  template <typename Arg> class NdegTwistedMass : public Dslash<nDegTwistedMass, Arg>
+  {
+    using Dslash = Dslash<nDegTwistedMass, Arg>;
+    using Dslash::arg;
+    using Dslash::in;
+
+  public:
+    NdegTwistedMass(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                    const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo) // all arguments are forwarded unchanged; nothing else is set up here
+    {
+    }
+    // Obtain tuned launch parameters, hand them to the base class, then dispatch; only the xpay = true form is supported.
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      if (arg.xpay)
+        Dslash::template instantiate<packShmem, true>(tp, stream);
+      else
+        errorQuda("Non-degenerate twisted-mass operator only defined for xpay=true");
+    }
+    // Base-class flops plus the twisted-mass term, added only for the interior, uber and policy kernel types.
+    long long flops() const
+    {
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+        flops += in.size() * 2 * in.Ncolor() * 4 * 4 * in.Volume(); // complex * Nc * Ns * fma * vol
+        break;
+      default: break; // twisted-mass flops are in the interior kernel
+      }
+      return flops;
+    }
+  };
+  /** Empty tag type: passes the compile-time distance_pc flag as an argument type. NOTE(review): the same definition is repeated in dslash_ndeg_twisted_clover.hpp and dslash_ndeg_twisted_clover_preconditioned.hpp, and none of these headers has an include guard - including two of them in one translation unit would be a redefinition. */
+  template <bool distance_pc> struct DistanceType {
+  };
+
+  /** Functor whose constructor builds the kernel argument struct, wraps it in NdegTwistedMass and constructs a DslashPolicyTune over it. When SIGNATURE_ONLY is defined only the constructor declaration is emitted, so the body must be compiled where this header is included without that macro. */
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct NdegTwistedMassApply {
+    template <bool distance_pc> // deduced from the unnamed DistanceType tag argument; the tag is not used inside the body
+    NdegTwistedMassApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                         cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b, double c,
+                         int parity, bool dagger, const int *comm_override,DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+    ;
+#else
+    {
+      constexpr int nDim = 4;
+      auto halo = ColorSpinorField::create_comms_batch(in); // halo field created from the input batch
+      NdegTwistedMassArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, a, b, c, x, parity, dagger,
+                                                                comm_override);
+      NdegTwistedMass<decltype(arg)> twisted(arg, out, in, halo);
+      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile); // presumably tunes and runs the operator on construction - confirm in dslash_policy.hpp
+    }
+#endif
+  };
+} // namespace quda
diff --git a/lib/dslash_ndeg_twisted_mass.in.cu b/lib/dslash_ndeg_twisted_mass.in.cu
new file mode 100644
index 0000000000..783482a945
--- /dev/null
+++ b/lib/dslash_ndeg_twisted_mass.in.cu
@@ -0,0 +1,20 @@
+#include <dslash_ndeg_twisted_mass.hpp>
+  
+namespace quda
+{
+  // Values written as @NAME@ are placeholders; presumably substituted per generated translation unit by the build system.
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+  // Type aliases derived from the selections above.
+  using DDArg = @QUDA_DSLASH_DDARG@;
+  using Float = precision_type_mapper<precision>::type;
+  // Explicit instantiation of the functor class for this parameter combination ...
+  template struct NdegTwistedMassApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+  // ... and of its constructor template for the selected distance_pc tag.
+  template NdegTwistedMassApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::NdegTwistedMassApply(
+    cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
+    const GaugeField &U, double a, double b, double c, int parity, bool dagger, const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_ndeg_twisted_mass_preconditioned.cu b/lib/dslash_ndeg_twisted_mass_preconditioned.cu
index 8acd16a730..9af7437df4 100644
--- a/lib/dslash_ndeg_twisted_mass_preconditioned.cu
+++ b/lib/dslash_ndeg_twisted_mass_preconditioned.cu
@@ -1,10 +1,6 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_ndeg_twisted_mass_preconditioned.cuh>
+#define SIGNATURE_ONLY
+#include <dslash_ndeg_twisted_mass_preconditioned.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the preconditioned twisted-mass operator acting on a non-generate
@@ -14,106 +10,6 @@
 namespace quda
 {
 
-  // trait to ensure we don't instantiate asymmetric & xpay
-  template <bool symmetric> constexpr bool xpay_() { return true; }
-  template <> constexpr bool xpay_<true>() { return false; }
-
-  // trait to ensure we don't instantiate asymmetric & !dagger
-  template <bool symmetric> constexpr bool not_dagger_() { return false; }
-  template <> constexpr bool not_dagger_<true>() { return true; }
-
-  template <typename Arg> class NdegTwistedMassPreconditioned : public Dslash<nDegTwistedMassPreconditioned, Arg>
-  {
-    using Dslash = Dslash<nDegTwistedMassPreconditioned, Arg>;
-    using Dslash::arg;
-    using Dslash::halo;
-    using Dslash::in;
-
-  protected:
-    bool shared;
-    unsigned int sharedBytesPerThread() const
-    {
-      return shared ? 2 * in.Ncolor() * 4 * sizeof(typename mapper<typename Arg::Float>::type) : 0;
-    }
-
-  public:
-    NdegTwistedMassPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                  const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo), shared(arg.asymmetric || !arg.dagger)
-    {
-      if (shared) TunableKernel3D::resizeStep(2, 1); // this will force flavor to be contained in the block
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      if (arg.asymmetric && !arg.dagger) errorQuda("asymmetric operator only defined for dagger");
-      if (arg.asymmetric && arg.xpay) errorQuda("asymmetric operator not defined for xpay");
-      if (arg.nParity != 1) errorQuda("Preconditioned non-degenerate twisted-mass operator not defined nParity=%d", arg.nParity);
-
-      if (arg.dagger) {
-        if (arg.xpay)
-          Dslash::template instantiate<packShmem, 1, true, xpay_<Arg::asymmetric>()>(tp, stream);
-        else
-          Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
-      } else {
-        if (arg.xpay)
-          Dslash::template instantiate<packShmem, 1, not_dagger_<Arg::asymmetric>(), xpay_<Arg::asymmetric>()>(tp, stream);
-        else
-          Dslash::template instantiate<packShmem, 1, not_dagger_<Arg::asymmetric>(), false>(tp, stream);
-      }
-    }
-
-    void initTuneParam(TuneParam &param) const
-    {
-      Dslash::initTuneParam(param);
-      if (shared) param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
-    }
-
-    void defaultTuneParam(TuneParam &param) const
-    {
-      Dslash::defaultTuneParam(param);
-      if (shared) param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
-    }
-
-    long long flops() const
-    {
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-        flops += 2 * in.Ncolor() * 4 * 4 * halo.Volume(); // complex * Nc * Ns * fma * vol
-        break;
-      default: break; // twisted-mass flops are in the interior kernel
-      }
-      return flops;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct NdegTwistedMassPreconditionedApply {
-
-    NdegTwistedMassPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                       cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b,
-                                       double c, bool xpay, int parity, bool dagger, bool asymmetric,
-                                       const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      if (asymmetric) {
-        NdegTwistedMassArg<Float, nColor, nDim, recon, true> arg(out, in, halo, U, a, b, c, xpay, x, parity, dagger,
-                                                                 comm_override);
-        NdegTwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
-        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
-      } else {
-        NdegTwistedMassArg<Float, nColor, nDim, recon, false> arg(out, in, halo, U, a, b, c, xpay, x, parity, dagger,
-                                                                  comm_override);
-        NdegTwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
-        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
-      }
-    }
-  };
 
   // Apply the non-degenerate twisted-mass Dslash operator
   // out(x) = M*in = a*(1 + i*b*gamma_5*tau_3 + c*tau_1)*D + x
@@ -124,8 +20,9 @@ namespace quda
                                           bool asymmetric, const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
+      auto dummy = DistanceType<false>();
       instantiate<NdegTwistedMassPreconditionedApply>(out, in, x, U, a, b, c, xpay, parity, dagger, asymmetric,
-                                                      comm_override, profile);
+                                                      comm_override, dummy, profile);
     } else {
       errorQuda("Non-degenerate preconditioned twisted-mass dslash has not been built");
     }
diff --git a/lib/dslash_ndeg_twisted_mass_preconditioned.hpp b/lib/dslash_ndeg_twisted_mass_preconditioned.hpp
new file mode 100644
index 0000000000..47e3029965
--- /dev/null
+++ b/lib/dslash_ndeg_twisted_mass_preconditioned.hpp
@@ -0,0 +1,126 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_ndeg_twisted_mass_preconditioned.cuh>
+
+/**
+   This is the preconditioned twisted-mass operator acting on a non-degenerate
+   quark doublet.
+*/
+
+namespace quda
+{
+
+  // trait to ensure we don't instantiate asymmetric & xpay:
+  // yields true (xpay may be instantiated) only when the operator is not asymmetric
+  template <bool asymmetric> constexpr bool xpay_() { return !asymmetric; }
+
+  // trait to ensure we don't instantiate asymmetric & !dagger:
+  // yields true (dagger forced on) only when the operator is asymmetric
+  template <bool asymmetric> constexpr bool not_dagger_() { return asymmetric; }
+
+  template <typename Arg> class NdegTwistedMassPreconditioned : public Dslash<nDegTwistedMassPreconditioned, Arg> // wraps tuning and launch of the nDegTwistedMassPreconditioned kernel
+  {
+    using Dslash = Dslash<nDegTwistedMassPreconditioned, Arg>;
+    using Dslash::arg;
+    using Dslash::halo;
+    using Dslash::in;
+
+  protected:
+    bool shared; // set in the constructor; when true a shared-memory size is requested through param.shared_bytes
+    unsigned int sharedBytesPerThread() const // bytes requested per thread; zero when `shared` is false
+    {
+      return shared ? 2 * in.Ncolor() * 4 * sizeof(typename mapper<typename Arg::Float>::type) : 0;
+    }
+
+  public:
+    NdegTwistedMassPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                  const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo), shared(arg.asymmetric || !arg.dagger) // shared memory is requested unless this is the symmetric dagger case
+    {
+      if (shared) TunableKernel3D::resizeStep(2, 1); // this will force flavor to be contained in the block
+    }
+
+    void apply(const qudaStream_t &stream) // tune, reject unsupported flag combinations, then dispatch to a compile-time instantiation
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      if (arg.asymmetric && !arg.dagger) errorQuda("asymmetric operator only defined for dagger");
+      if (arg.asymmetric && arg.xpay) errorQuda("asymmetric operator not defined for xpay");
+      if (arg.nParity != 1) errorQuda("Preconditioned non-degenerate twisted-mass operator not defined nParity=%d", arg.nParity);
+
+      if (arg.dagger) {
+        if (arg.xpay)
+          Dslash::template instantiate<packShmem, 1, true, xpay_<Arg::asymmetric>()>(tp, stream); // xpay_ yields false for an asymmetric Arg
+        else
+          Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
+      } else {
+        if (arg.xpay)
+          Dslash::template instantiate<packShmem, 1, not_dagger_<Arg::asymmetric>(), xpay_<Arg::asymmetric>()>(tp, stream); // not_dagger_ yields true for an asymmetric Arg
+        else
+          Dslash::template instantiate<packShmem, 1, not_dagger_<Arg::asymmetric>(), false>(tp, stream);
+      }
+    }
+
+    void initTuneParam(TuneParam &param) const // base initialisation, then size shared memory for the chosen block
+    {
+      Dslash::initTuneParam(param);
+      if (shared) param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
+    }
+
+    void defaultTuneParam(TuneParam &param) const // same shared-memory sizing as initTuneParam, applied to the default parameters
+    {
+      Dslash::defaultTuneParam(param);
+      if (shared) param.shared_bytes = sharedBytesPerThread() * param.block.x * param.block.y * param.block.z;
+    }
+
+    long long flops() const // base Dslash flops plus the twisted-mass term for interior/uber/policy kernel types
+    {
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+        flops += 2 * in.Ncolor() * 4 * 4 * halo.Volume(); // complex * Nc * Ns * fma * vol
+        break;
+      default: break; // twisted-mass flops are in the interior kernel
+      }
+      return flops;
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType { // empty tag type: encodes distance_pc in the type so a constructor template can deduce it
+  }; // NOTE(review): the same tag is defined in dslash_staggered.hpp and dslash_twisted_clover.hpp and none of these headers shows an include guard -- including two of them in one translation unit would redefine it
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
+  struct NdegTwistedMassPreconditionedApply { // the constructor builds the kernel argument and constructs a DslashPolicyTune for it
+    template <bool distance_pc> // deduced from the DistanceType tag argument; not referenced inside the constructor body
+    NdegTwistedMassPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                       cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b,
+                                       double c, bool xpay, int parity, bool dagger, bool asymmetric,
+                                       const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+    ; // declaration only when the including file defines SIGNATURE_ONLY
+#else
+    {
+      constexpr int nDim = 4;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      if (asymmetric) { // the runtime flag selects the last (compile-time) template argument of NdegTwistedMassArg
+        NdegTwistedMassArg<Float, nColor, nDim, DDArg, recon, true> arg(out, in, halo, U, a, b, c, xpay, x, parity,
+                                                                        dagger, comm_override);
+        NdegTwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
+      } else {
+        NdegTwistedMassArg<Float, nColor, nDim, DDArg, recon, false> arg(out, in, halo, U, a, b, c, xpay, x, parity,
+                                                                         dagger, comm_override);
+        NdegTwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
+      }
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_ndeg_twisted_mass_preconditioned.in.cu b/lib/dslash_ndeg_twisted_mass_preconditioned.in.cu
new file mode 100644
index 0000000000..c82269d822
--- /dev/null
+++ b/lib/dslash_ndeg_twisted_mass_preconditioned.in.cu
@@ -0,0 +1,23 @@
+#include <dslash_ndeg_twisted_mass_preconditioned.hpp>
+
+namespace quda
+{
+
+
+    constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION; // NOTE(review): tokens between at-signs look like build-time substitutions (this is a .in.cu template) -- confirm against the CMake configure step
+    constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+    constexpr int reconI = @QUDA_DSLASH_RECONI@; // used below as an index into ReconstructWilson::recon[]
+    constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@; // NOTE(review): the constructor body in the .hpp does not reference distance_pc
+
+    typedef @QUDA_DSLASH_DDARG@ DDArg;
+    typedef precision_type_mapper<precision>::type Float; // floating-point type mapped from the precision enum above
+
+
+    template struct NdegTwistedMassPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>; // explicit instantiation of the class for this single parameter combination
+    
+    template NdegTwistedMassPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::NdegTwistedMassPreconditionedApply(
+		    cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                    cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b,
+                    double c, bool xpay, int parity, bool dagger, bool asymmetric,
+                    const int *comm_override, DistanceType<distance_pc>,TimeProfile &profile);
+} // namespace quda
diff --git a/lib/dslash_staggered.cu b/lib/dslash_staggered.cu
index 566c1cbcda..33ed8e18fb 100644
--- a/lib/dslash_staggered.cu
+++ b/lib/dslash_staggered.cu
@@ -1,14 +1,7 @@
-#include <dslash.h>
-#include <worker.h>
-#include <dslash_helper.cuh>
-#include <color_spinor_field_order.h>
-#include <gauge_field_order.h>
-#include <color_spinor.h>
-#include <index_helper.cuh>
-#include <gauge_field.h>
+#define SIGNATURE_ONLY
+#include <dslash_staggered.hpp>
+#undef SIGNATURE_ONLY
 
-#include <dslash_policy.hpp>
-#include <kernels/dslash_staggered.cuh>
 
 /**
    This is a staggered Dirac operator
@@ -17,77 +10,13 @@
 namespace quda
 {
 
-  template <typename Arg> class Staggered : public Dslash<staggered, Arg>
-  {
-    using Dslash = Dslash<staggered, Arg>;
-    using Dslash::arg;
-
-  public:
-    Staggered(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-              const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      // operator is anti-Hermitian so do not instantiate dagger
-      if (arg.nParity == 1) {
-        if (arg.xpay)
-          Dslash::template instantiate<packStaggeredShmem, 1, false, true>(tp, stream);
-        else
-          Dslash::template instantiate<packStaggeredShmem, 1, false, false>(tp, stream);
-      } else if (arg.nParity == 2) {
-        if (arg.xpay)
-          Dslash::template instantiate<packStaggeredShmem, 2, false, true>(tp, stream);
-        else
-          Dslash::template instantiate<packStaggeredShmem, 2, false, false>(tp, stream);
-      }
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon_u> struct StaggeredApply {
-    StaggeredApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                   cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, int parity, bool dagger,
-                   const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      constexpr bool improved = false;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-
-      if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC
-          || (U.LinkType() == QUDA_GENERAL_LINKS && U.Reconstruct() == QUDA_RECONSTRUCT_NO)) {
-        if constexpr (is_enabled<QUDA_MILC_GAUGE_ORDER>()) {
-          StaggeredArg<Float, nColor, nDim, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_MILC> arg(
-            out, in, halo, U, U, a, x, parity, dagger, comm_override);
-          Staggered<decltype(arg)> staggered(arg, out, in, halo);
-
-          dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
-        } else {
-          errorQuda("MILC interface has not been built so MILC phase staggered fermions not enabled");
-        }
-      } else if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) {
-        if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) {
-          StaggeredArg<Float, nColor, nDim, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_TIFR> arg(
-            out, in, halo, U, U, a, x, parity, dagger, comm_override);
-          Staggered<decltype(arg)> staggered(arg, out, in, halo);
-
-          dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
-        } else {
-          errorQuda("TIFR interface has not been built so TIFR phase taggered fermions not enabled");
-        }
-      }
-    }
-  };
-
   void ApplyStaggered(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, const GaugeField &U,
                       double a, cvector_ref<const ColorSpinorField> &x, int parity, bool dagger,
                       const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_STAGGERED_DSLASH>()) {
-      instantiate<StaggeredApply, ReconstructStaggered>(out, in, x, U, a, parity, dagger, comm_override, profile);
+      auto dummy = DistanceType<false>();
+      instantiate<StaggeredApply, ReconstructStaggered>(out, in, x, U, a, parity, dagger, comm_override, dummy, profile);
     } else {
       errorQuda("Staggered operator has not been built");
     }
diff --git a/lib/dslash_staggered.hpp b/lib/dslash_staggered.hpp
new file mode 100644
index 0000000000..5386ec7dc4
--- /dev/null
+++ b/lib/dslash_staggered.hpp
@@ -0,0 +1,94 @@
+#include <dslash.h>
+#include <worker.h>
+#include <dslash_helper.cuh>
+#include <color_spinor_field_order.h>
+#include <gauge_field_order.h>
+#include <color_spinor.h>
+#include <index_helper.cuh>
+#include <gauge_field.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_staggered.cuh>
+
+/**
+   This is a staggered Dirac operator
+*/
+
+namespace quda
+{
+
+  template <typename Arg> class Staggered : public Dslash<staggered, Arg> // wraps tuning and launch of the staggered kernel
+  {
+    using Dslash = Dslash<staggered, Arg>;
+    using Dslash::arg;
+
+  public:
+    Staggered(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+              const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo) // all state lives in the Dslash base
+    {
+    }
+
+    void apply(const qudaStream_t &stream) // dispatch on nParity (1 or 2) and xpay; any other nParity launches nothing
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      // operator is anti-Hermitian so do not instantiate dagger
+      if (arg.nParity == 1) {
+        if (arg.xpay)
+          Dslash::template instantiate<packStaggeredShmem, 1, false, true>(tp, stream);
+        else
+          Dslash::template instantiate<packStaggeredShmem, 1, false, false>(tp, stream);
+      } else if (arg.nParity == 2) {
+        if (arg.xpay)
+          Dslash::template instantiate<packStaggeredShmem, 2, false, true>(tp, stream);
+        else
+          Dslash::template instantiate<packStaggeredShmem, 2, false, false>(tp, stream);
+      }
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType { // empty tag type: encodes distance_pc in the type so a constructor template can deduce it
+  }; // NOTE(review): the same tag is defined in dslash_ndeg_twisted_mass_preconditioned.hpp and dslash_twisted_clover.hpp and none of these headers shows an include guard -- including two of them in one translation unit would redefine it
+
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon_u> struct StaggeredApply { // the constructor builds the kernel argument and constructs a DslashPolicyTune for it
+    template <bool distance_pc> // deduced from the DistanceType tag argument; not referenced inside the constructor body
+    StaggeredApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                   cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, int parity, bool dagger,
+                   const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+    ; // declaration only when the including file defines SIGNATURE_ONLY
+#else
+    {
+      constexpr int nDim = 4;
+      constexpr bool improved = false;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+
+      if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_MILC
+          || (U.LinkType() == QUDA_GENERAL_LINKS && U.Reconstruct() == QUDA_RECONSTRUCT_NO)) {
+        if constexpr (is_enabled<QUDA_MILC_GAUGE_ORDER>()) {
+          StaggeredArg<Float, nColor, nDim, DDArg, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_MILC> arg(
+            out, in, halo, U, U, a, x, parity, dagger, comm_override); // U is passed for both gauge-field arguments
+          Staggered<decltype(arg)> staggered(arg, out, in, halo);
+
+          dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
+        } else {
+          errorQuda("MILC interface has not been built so MILC phase staggered fermions not enabled");
+        }
+      } else if (U.StaggeredPhase() == QUDA_STAGGERED_PHASE_TIFR) {
+        if constexpr (is_enabled<QUDA_TIFR_GAUGE_ORDER>()) {
+          StaggeredArg<Float, nColor, nDim, DDArg, recon_u, QUDA_RECONSTRUCT_NO, improved, QUDA_STAGGERED_PHASE_TIFR> arg(
+            out, in, halo, U, U, a, x, parity, dagger, comm_override); // U is passed for both gauge-field arguments
+          Staggered<decltype(arg)> staggered(arg, out, in, halo);
+
+          dslash::DslashPolicyTune<decltype(staggered)> policy(staggered, in, halo, profile);
+        } else {
+          errorQuda("TIFR interface has not been built so TIFR phase staggered fermions not enabled");
+        }
+      } // NOTE(review): any other phase/link/reconstruct combination falls through here without building an argument or raising an error
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_staggered.in.cu b/lib/dslash_staggered.in.cu
new file mode 100644
index 0000000000..b1c29d2930
--- /dev/null
+++ b/lib/dslash_staggered.in.cu
@@ -0,0 +1,21 @@
+#include <dslash_staggered.hpp>
+
+namespace quda
+{
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION; // NOTE(review): tokens between at-signs look like build-time substitutions (this is a .in.cu template) -- confirm against the CMake configure step
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@; // used below as an index into ReconstructStaggered::recon[]
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@; // NOTE(review): the constructor body in the .hpp does not reference distance_pc
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float; // floating-point type mapped from the precision enum above
+
+
+  template struct StaggeredApply<Float, nColor, DDArg, ReconstructStaggered::recon[reconI]>; // explicit instantiation of the class for this single parameter combination
+    
+  template StaggeredApply<Float, nColor, DDArg, ReconstructStaggered::recon[reconI]>::StaggeredApply(
+		  cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+		  cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, int parity, bool dagger,
+                  const int *comm_override, DistanceType<distance_pc>,TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_twisted_clover.cu b/lib/dslash_twisted_clover.cu
index b48e137f05..a00d2ad1e6 100644
--- a/lib/dslash_twisted_clover.cu
+++ b/lib/dslash_twisted_clover.cu
@@ -1,11 +1,7 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <clover_field.h>
-#include <dslash.h>
-#include <worker.h>
+#define SIGNATURE_ONLY
+#include <dslash_twisted_clover.hpp>
+#undef SIGNATURE_ONLY
 
-#include <dslash_policy.hpp>
-#include <kernels/dslash_wilson_clover.cuh>
 
 /**
    This is the basic gauged twisted-clover operator
@@ -14,73 +10,6 @@
 namespace quda
 {
 
-  template <typename Arg> class TwistedClover : public Dslash<wilsonClover, Arg>
-  {
-    using Dslash = Dslash<wilsonClover, Arg>;
-    using Dslash::arg;
-    using Dslash::halo;
-    using Dslash::in;
-
-  public:
-    TwistedClover(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                  const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      if (arg.xpay)
-        this->template instantiate<packShmem, true>(tp, stream);
-      else
-        errorQuda("Twisted-clover operator only defined for xpay=true");
-    }
-
-    long long flops() const
-    {
-      int clover_flops = 504 + 48;
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY: flops += clover_flops * halo.Volume(); break;
-      default: break; // all clover flops are in the interior kernel
-      }
-      return flops;
-    }
-
-    long long bytes() const
-    {
-      int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
-
-      long long bytes = Dslash::bytes();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY: bytes += clover_bytes * halo.Volume(); break;
-      default: break;
-      }
-
-      return bytes;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct TwistedCloverApply {
-
-    TwistedCloverApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                       cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &C, double a,
-                       double b, int parity, bool dagger, const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      WilsonCloverArg<Float, nColor, nDim, recon, true> arg(out, in, halo, U, C, a, b, x, parity, dagger, comm_override);
-      TwistedClover<decltype(arg)> twisted(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
-    }
-  };
-
   // Apply the twisted-mass Dslash operator
   // out(x) = M*in = (A + i*b*gamma_5)*in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
   // Uses the kappa normalization for the Wilson operator, with a = -kappa.
@@ -90,7 +19,8 @@ namespace quda
                           TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
-      instantiate<TwistedCloverApply>(out, in, x, U, C, a, b, parity, dagger, comm_override, profile);
+      auto dummy = DistanceType<false>();
+      instantiate<TwistedCloverApply>(out, in, x, U, C, a, b, parity, dagger, comm_override, dummy, profile);
     } else {
       errorQuda("Twisted-clover operator has not been built");
     }
diff --git a/lib/dslash_twisted_clover.hpp b/lib/dslash_twisted_clover.hpp
new file mode 100644
index 0000000000..b37ac649d7
--- /dev/null
+++ b/lib/dslash_twisted_clover.hpp
@@ -0,0 +1,93 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <clover_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_wilson_clover.cuh>
+
+/**
+   This is the basic gauged twisted-clover operator
+*/
+
+namespace quda
+{
+
+  template <typename Arg> class TwistedClover : public Dslash<wilsonClover, Arg> // wraps tuning and launch of the wilsonClover kernel for the twisted-clover operator
+  {
+    using Dslash = Dslash<wilsonClover, Arg>;
+    using Dslash::arg;
+    using Dslash::halo;
+    using Dslash::in;
+
+  public:
+    TwistedClover(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                  const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo) // all state lives in the Dslash base
+    {
+    }
+
+    void apply(const qudaStream_t &stream) // only the xpay form is instantiated; xpay=false raises an error
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      if (arg.xpay)
+        Dslash::template instantiate<packShmem, true>(tp, stream);
+      else
+        errorQuda("Twisted-clover operator only defined for xpay=true");
+    }
+
+    long long flops() const // base Dslash flops plus clover_flops per halo.Volume() site for interior/uber/policy kernel types
+    {
+      int clover_flops = 504 + 48;
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY: flops += clover_flops * halo.Volume(); break;
+      default: break; // all clover flops are in the interior kernel
+      }
+      return flops;
+    }
+
+    long long bytes() const // base Dslash bytes plus clover_bytes per halo.Volume() site for interior/uber/policy kernel types
+    {
+      int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0); // fixed-point Float adds two floats per site
+
+      long long bytes = Dslash::bytes();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY: bytes += clover_bytes * halo.Volume(); break;
+      default: break;
+      }
+
+      return bytes;
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType { // empty tag type: encodes distance_pc in the type so a constructor template can deduce it
+  }; // NOTE(review): the same tag is defined in dslash_ndeg_twisted_mass_preconditioned.hpp and dslash_staggered.hpp and none of these headers shows an include guard -- including two of them in one translation unit would redefine it
+
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct TwistedCloverApply { // the constructor builds the kernel argument and constructs a DslashPolicyTune for it
+    template <bool distance_pc> // deduced from the DistanceType tag argument and forwarded to WilsonCloverArg below
+    TwistedCloverApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                       cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &C, double a,
+                       double b, int parity, bool dagger, const int *comm_override, DistanceType<distance_pc>,TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+    ; // declaration only when the including file defines SIGNATURE_ONLY
+#else
+    {
+      constexpr int nDim = 4;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      WilsonCloverArg<Float, nColor, nDim, DDArg, recon, true,distance_pc> arg(out, in, halo, U, C, a, b, x, parity, dagger, // NOTE(review): the literal `true` presumably selects the twisted variant -- confirm against WilsonCloverArg
+                                                                   comm_override);
+      TwistedClover<decltype(arg)> twisted(arg, out, in, halo);
+      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_twisted_clover.in.cu b/lib/dslash_twisted_clover.in.cu
new file mode 100644
index 0000000000..ee3b34ee12
--- /dev/null
+++ b/lib/dslash_twisted_clover.in.cu
@@ -0,0 +1,23 @@
+#include <dslash_twisted_clover.hpp>
+
+namespace quda
+{
+
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION; // NOTE(review): tokens between at-signs look like build-time substitutions (this is a .in.cu template) -- confirm against the CMake configure step
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@; // used below as an index into ReconstructWilson::recon[]
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@; // selects DistanceType<distance_pc> in the constructor instantiation that follows
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float; // floating-point type mapped from the precision enum above
+
+  template struct TwistedCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>; // explicit instantiation of the class for this single parameter combination
+  
+  template TwistedCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::TwistedCloverApply(
+		  cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, 
+		  cvector_ref<const ColorSpinorField> &x,const GaugeField &U, 
+		  const CloverField &C, double a, double b, int parity, 
+		  bool dagger, const int *comm_override,
+		  DistanceType<distance_pc>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_twisted_clover_preconditioned.cu b/lib/dslash_twisted_clover_preconditioned.cu
index 91f7f147f8..d5a3883c88 100644
--- a/lib/dslash_twisted_clover_preconditioned.cu
+++ b/lib/dslash_twisted_clover_preconditioned.cu
@@ -1,11 +1,6 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <clover_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_twisted_clover_preconditioned.cuh>
+#define SIGNATURE_ONLY
+#include <dslash_twisted_clover_preconditioned.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the preconditioned gauged twisted-mass operator
@@ -14,119 +9,6 @@
 namespace quda
 {
 
-  template <typename Arg> class TwistedCloverPreconditioned : public Dslash<twistedCloverPreconditioned, Arg>
-  {
-    using Dslash = Dslash<twistedCloverPreconditioned, Arg>;
-    using Dslash::arg;
-    using Dslash::halo;
-    using Dslash::in;
-
-  public:
-    TwistedCloverPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      // specialize here to constrain the template instantiation
-      if (arg.nParity == 1) {
-        if (arg.xpay) {
-          if (arg.dagger) errorQuda("xpay operator only defined for not dagger");
-          Dslash::template instantiate<packShmem, 1, false, true>(tp, stream);
-        } else {
-          if (arg.dagger)
-            Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
-          else
-            Dslash::template instantiate<packShmem, 1, false, false>(tp, stream);
-        }
-      } else {
-        errorQuda("Preconditioned twisted-clover operator not defined nParity=%d", arg.nParity);
-      }
-    }
-
-    long long flops() const
-    {
-      int clover_flops = 504 + 48;
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T: flops += clover_flops * 2 * halo.GhostFace()[arg.kernel_type]; break;
-      case EXTERIOR_KERNEL_ALL:
-        flops
-          += clover_flops * 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        break;
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-        flops += clover_flops * halo.Volume();
-
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for flops done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        flops -= clover_flops * ghost_sites;
-
-        break;
-      }
-      return flops;
-    }
-
-    long long bytes() const
-    {
-      int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
-      if (!arg.dynamic_clover) clover_bytes *= 2;
-
-      long long bytes = Dslash::bytes();
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T: bytes += clover_bytes * 2 * halo.GhostFace()[arg.kernel_type]; break;
-      case EXTERIOR_KERNEL_ALL:
-        bytes
-          += clover_bytes * 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        break;
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-        bytes += clover_bytes * halo.Volume();
-
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for bytes done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        bytes -= clover_bytes * ghost_sites;
-
-        break;
-      }
-
-      return bytes;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct TwistedCloverPreconditionedApply {
-
-    TwistedCloverPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                     cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &C,
-                                     double a, double b, bool xpay, int parity, bool dagger, const int *comm_override,
-                                     TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      TwistedCloverArg<Float, nColor, nDim, recon> arg(out, in, halo, U, C, a, b, xpay, x, parity, dagger, comm_override);
-      TwistedCloverPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
-    }
-  };
-
   /*
     Apply the preconditioned twisted-mass Dslash operator
 
@@ -138,7 +20,8 @@ namespace quda
                                         const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_TWISTED_CLOVER_DSLASH>()) {
-      instantiate<TwistedCloverPreconditionedApply>(out, in, x, U, C, a, b, xpay, parity, dagger, comm_override, profile);
+      auto dummy = DistanceType<false>();
+      instantiate<TwistedCloverPreconditionedApply>(out, in, x, U, C, a, b, xpay, parity, dagger, comm_override, dummy, profile);
     } else {
       errorQuda("Twisted-clover operator has not been built");
     }
diff --git a/lib/dslash_twisted_clover_preconditioned.hpp b/lib/dslash_twisted_clover_preconditioned.hpp
new file mode 100644
index 0000000000..b6fbaec92a
--- /dev/null
+++ b/lib/dslash_twisted_clover_preconditioned.hpp
@@ -0,0 +1,142 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <clover_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_twisted_clover_preconditioned.cuh>
+
+/**
+   This is the preconditioned gauged twisted-clover operator
+*/
+
+namespace quda
+{
+
+  template <typename Arg> class TwistedCloverPreconditioned : public Dslash<twistedCloverPreconditioned, Arg>
+  { // Dslash specialization for the twistedCloverPreconditioned kernel: apply() dispatches, flops()/bytes() report cost
+    using Dslash = Dslash<twistedCloverPreconditioned, Arg>;
+    using Dslash::arg;
+    using Dslash::halo;
+    using Dslash::in;
+
+  public:
+    TwistedCloverPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo)
+    {
+    }
+
+    void apply(const qudaStream_t &stream) // fetch the tuned launch parameters, then dispatch on nParity / xpay / dagger
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      // specialize here to constrain the template instantiation; per the branches the args after packShmem are nParity, dagger, xpay
+      if (arg.nParity == 1) {
+        if (arg.xpay) {
+          if (arg.dagger) errorQuda("xpay operator only defined for not dagger");
+          Dslash::template instantiate<packShmem, 1, false, true>(tp, stream);
+        } else {
+          if (arg.dagger)
+            Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
+          else
+            Dslash::template instantiate<packShmem, 1, false, false>(tp, stream);
+        }
+      } else {
+        errorQuda("Preconditioned twisted-clover operator not defined nParity=%d", arg.nParity);
+      }
+    }
+
+    long long flops() const // Dslash::flops() plus clover_flops for each site handled by the current kernel type
+    {
+      int clover_flops = 504 + 48;
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T: flops += clover_flops * 2 * halo.GhostFace()[arg.kernel_type]; break; // both faces of one dimension
+      case EXTERIOR_KERNEL_ALL:
+        flops
+          += clover_flops * 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        break;
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+        flops += clover_flops * halo.Volume();
+
+        if (arg.kernel_type == KERNEL_POLICY) break; // the policy total keeps the full volume
+        // now correct for flops done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        flops -= clover_flops * ghost_sites;
+
+        break;
+      }
+      return flops;
+    }
+
+    long long bytes() const // Dslash::bytes() plus clover_bytes for each site handled by the current kernel type
+    {
+      int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
+      if (!arg.dynamic_clover) clover_bytes *= 2; // NOTE(review): presumably a second stored clover term is read; confirm
+
+      long long bytes = Dslash::bytes();
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T: bytes += clover_bytes * 2 * halo.GhostFace()[arg.kernel_type]; break; // both faces of one dimension
+      case EXTERIOR_KERNEL_ALL:
+        bytes
+          += clover_bytes * 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        break;
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+        bytes += clover_bytes * halo.Volume();
+
+        if (arg.kernel_type == KERNEL_POLICY) break; // the policy total keeps the full volume
+        // now correct for bytes done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        bytes -= clover_bytes * ghost_sites;
+
+        break;
+      }
+
+      return bytes;
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType { // empty tag type: lets a constructor template deduce distance_pc
+  }; // NOTE(review): this header has no include guard and sibling dslash_*.hpp headers define the same template
+
+
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
+  struct TwistedCloverPreconditionedApply { // all work happens in the constructor: build the kernel arg, then run the dslash policy
+    template <bool distance_pc>
+    TwistedCloverPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                     cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &C,
+                                     double a, double b, bool xpay, int parity, bool dagger, const int *comm_override,DistanceType<distance_pc>,
+                                     TimeProfile &profile)
+#ifdef SIGNATURE_ONLY // declaration only; the body is compiled where this header is included without SIGNATURE_ONLY
+    ;
+#else
+    // distance_pc is only deduced from the unnamed tag argument; the body below does not use it
+    {
+      constexpr int nDim = 4;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      TwistedCloverArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, C, a, b, xpay, x, parity, dagger,
+                                                              comm_override);
+      TwistedCloverPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_twisted_clover_preconditioned.in.cu b/lib/dslash_twisted_clover_preconditioned.in.cu
new file mode 100644
index 0000000000..f59075839d
--- /dev/null
+++ b/lib/dslash_twisted_clover_preconditioned.in.cu
@@ -0,0 +1,24 @@
+#include <dslash_twisted_clover_preconditioned.hpp>
+
+
+namespace quda
+{
+  // @NAME@ tokens are build-time placeholders (presumably substituted by CMake when this .in.cu is configured; confirm)
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@; // NOTE(review): the non-generated caller passes DistanceType<false> only
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float;
+  // explicit instantiation of the class template for the selected Float / nColor / DDArg / reconstruct type
+  template struct TwistedCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+  // the constructor is itself a template, so it is explicitly instantiated separately for DistanceType<distance_pc>
+  template TwistedCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::TwistedCloverPreconditionedApply(
+                  cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                  cvector_ref<const ColorSpinorField> &x,const GaugeField &U,
+                  const CloverField &C, double a, double b,bool xpay, int parity,
+                  bool dagger, const int *comm_override,
+                  DistanceType<distance_pc>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_twisted_mass.cu b/lib/dslash_twisted_mass.cu
index cc3d2c256a..6194a52c63 100644
--- a/lib/dslash_twisted_mass.cu
+++ b/lib/dslash_twisted_mass.cu
@@ -1,69 +1,10 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_twisted_mass.cuh>
-
-/**
-   This is the basic gauged twisted-mass operator
-*/
+#define SIGNATURE_ONLY
+#include <dslash_twisted_mass.hpp>
+#undef SIGNATURE_ONLY
 
 namespace quda
 {
 
-  template <typename Arg> class TwistedMass : public Dslash<twistedMass, Arg>
-  {
-    using Dslash = Dslash<twistedMass, Arg>;
-    using Dslash::arg;
-    using Dslash::in;
-
-  public:
-    TwistedMass(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      if (arg.xpay)
-        Dslash::template instantiate<packShmem, true>(tp, stream);
-      else
-        errorQuda("Twisted-mass operator only defined for xpay=true");
-    }
-
-    long long flops() const
-    {
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-        flops += in.size() * 2 * in.Ncolor() * 4 * 2 * in.Volume(); // complex * Nc * Ns * fma * vol
-        break;
-      default: break; // twisted-mass flops are in the interior kernel
-      }
-      return flops;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct TwistedMassApply {
-
-    TwistedMassApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                     cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b, int parity,
-                     bool dagger, const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      TwistedMassArg<Float, nColor, nDim, recon> arg(out, in, halo, U, a, b, x, parity, dagger, comm_override);
-      TwistedMass<decltype(arg)> twisted(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
-    }
-  };
 
   // Apply the twisted-mass Dslash operator
   // out(x) = M*in = (1 + i*b*gamma_5)*in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu)
@@ -73,7 +14,8 @@ namespace quda
                         bool dagger, const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
-      instantiate<TwistedMassApply>(out, in, x, U, a, b, parity, dagger, comm_override, profile);
+      auto dummy = DistanceType<false>();
+      instantiate<TwistedMassApply>(out, in, x, U, a, b, parity, dagger, comm_override, dummy, profile);
     } else {
       errorQuda("Twisted-mass operator has not been built");
     }
diff --git a/lib/dslash_twisted_mass.hpp b/lib/dslash_twisted_mass.hpp
new file mode 100644
index 0000000000..ff8d210a7b
--- /dev/null
+++ b/lib/dslash_twisted_mass.hpp
@@ -0,0 +1,74 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_twisted_mass.cuh>
+
+/**
+   This is the basic gauged twisted-mass operator
+*/
+
+namespace quda
+{
+
+  template <typename Arg> class TwistedMass : public Dslash<twistedMass, Arg>
+  { // Dslash specialization for the twistedMass kernel: apply() dispatches, flops() reports cost
+    using Dslash = Dslash<twistedMass, Arg>;
+    using Dslash::arg;
+    using Dslash::in;
+
+  public:
+    TwistedMass(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo)
+    {
+    }
+
+    void apply(const qudaStream_t &stream) // fetch the tuned launch parameters, then dispatch the xpay variant
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      if (arg.xpay) // only the xpay = true instantiation exists for this operator
+        Dslash::template instantiate<packShmem, true>(tp, stream);
+      else
+        errorQuda("Twisted-mass operator only defined for xpay=true");
+    }
+
+    long long flops() const // Dslash::flops() plus the twist-term flops for interior / uber / policy kernel types
+    {
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+        flops += in.size() * 2 * in.Ncolor() * 4 * 2 * in.Volume(); // complex * Nc * Ns * fma * vol
+        break;
+      default: break; // twisted-mass flops are in the interior kernel
+      }
+      return flops;
+    }
+  };
+  template <bool distance_pc> struct DistanceType { // empty tag type: lets a constructor template deduce distance_pc
+  }; // NOTE(review): this header has no include guard and sibling dslash_*.hpp headers define the same template
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct TwistedMassApply {
+    template <bool distance_pc> // deduced from the unnamed DistanceType tag argument; not used in the body
+    TwistedMassApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                     cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b, int parity,
+                     bool dagger, const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY // declaration only; the body is compiled where this header is included without SIGNATURE_ONLY
+    ;
+#else
+    { // all work happens in the constructor: build the kernel arg, then run the dslash policy
+      constexpr int nDim = 4;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      TwistedMassArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, a, b, x, parity, dagger, comm_override);
+      TwistedMass<decltype(arg)> twisted(arg, out, in, halo);
+      dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_twisted_mass.in.cu b/lib/dslash_twisted_mass.in.cu
new file mode 100644
index 0000000000..74df605e02
--- /dev/null
+++ b/lib/dslash_twisted_mass.in.cu
@@ -0,0 +1,20 @@
+#include <dslash_twisted_mass.hpp>
+
+namespace quda
+{ // @NAME@ tokens are build-time placeholders (presumably substituted by CMake when this .in.cu is configured; confirm)
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@; // NOTE(review): dslash_twisted_mass.cu passes DistanceType<false> only
+  // explicit instantiation of the class template for the selected Float / nColor / DDArg / reconstruct type
+  template struct TwistedMassApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>; 
+  // the constructor is itself a template, so it is explicitly instantiated separately for DistanceType<distance_pc>
+  template TwistedMassApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::TwistedMassApply(
+		  cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                  cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b, int parity,
+                  bool dagger, const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_twisted_mass_preconditioned.cu b/lib/dslash_twisted_mass_preconditioned.cu
index 43addab081..00ee3aed24 100644
--- a/lib/dslash_twisted_mass_preconditioned.cu
+++ b/lib/dslash_twisted_mass_preconditioned.cu
@@ -1,10 +1,6 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_twisted_mass_preconditioned.cuh>
+#define SIGNATURE_ONLY
+#include <dslash_twisted_mass_preconditioned.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the preconditioned gauged twisted-mass operator
@@ -13,88 +9,6 @@
 namespace quda
 {
 
-  // trait to ensure we don't instantiate asymmetric & xpay
-  template <bool symmetric> constexpr bool xpay_() { return true; }
-  template <> constexpr bool xpay_<true>() { return false; }
-
-  // trait to ensure we don't instantiate asymmetric & !dagger
-  template <bool symmetric> constexpr bool not_dagger_() { return false; }
-  template <> constexpr bool not_dagger_<true>() { return true; }
-
-  template <typename Arg> class TwistedMassPreconditioned : public Dslash<twistedMassPreconditioned, Arg>
-  {
-    using Dslash = Dslash<twistedMassPreconditioned, Arg>;
-    using Dslash::arg;
-    using Dslash::in;
-
-  public:
-    TwistedMassPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                              const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      if (arg.asymmetric && !arg.dagger) errorQuda("asymmetric operator only defined for dagger");
-      if (arg.asymmetric && arg.xpay) errorQuda("asymmetric operator not defined for xpay");
-      if (arg.nParity != 1) errorQuda("Preconditioned twisted-mass operator not defined nParity=%d", arg.nParity);
-
-      if (arg.dagger) {
-        if (arg.xpay)
-          Dslash::template instantiate<packShmem, 1, true, xpay_<Arg::asymmetric>()>(tp, stream);
-        else
-          Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
-      } else {
-        if (arg.xpay)
-          Dslash::template instantiate<packShmem, 1, not_dagger_<Arg::asymmetric>(), xpay_<Arg::asymmetric>()>(tp, stream);
-        else
-          Dslash::template instantiate<packShmem, 1, not_dagger_<Arg::asymmetric>(), false>(tp, stream);
-      }
-    }
-
-    long long flops() const
-    {
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-        flops += in.size() * 2 * in.Ncolor() * 4 * 2 * in.Volume(); // complex * Nc * Ns * fma * vol
-        break;
-      default: break;
-      }
-      return flops;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct TwistedMassPreconditionedApply {
-
-    TwistedMassPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                   cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b,
-                                   bool xpay, int parity, bool dagger, bool asymmetric, const int *comm_override,
-                                   TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      if (asymmetric) {
-        TwistedMassArg<Float, nColor, nDim, recon, true> arg(out, in, halo, U, a, b, xpay, x, parity, dagger,
-                                                             comm_override);
-        TwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
-
-        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
-      } else {
-        TwistedMassArg<Float, nColor, nDim, recon, false> arg(out, in, halo, U, a, b, xpay, x, parity, dagger,
-                                                              comm_override);
-        TwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
-
-        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
-      }
-    }
-  };
-
   /*
     Apply the preconditioned twisted-mass Dslash operator
 
@@ -105,8 +19,9 @@ namespace quda
                                       cvector_ref<const ColorSpinorField> &x, int parity, bool dagger, bool asymmetric,
                                       const int *comm_override, TimeProfile &profile)
   {
+    auto dummy = DistanceType<false>();
     if constexpr (is_enabled<QUDA_TWISTED_MASS_DSLASH>()) {
-      instantiate<TwistedMassPreconditionedApply>(out, in, x, U, a, b, xpay, parity, dagger, asymmetric, comm_override,
+      instantiate<TwistedMassPreconditionedApply>(out, in, x, U, a, b, xpay, parity, dagger, asymmetric, comm_override, dummy,
                                                   profile);
     } else {
       errorQuda("Twisted-mass operator has not been built");
diff --git a/lib/dslash_twisted_mass_preconditioned.hpp b/lib/dslash_twisted_mass_preconditioned.hpp
new file mode 100644
index 0000000000..a354c5113c
--- /dev/null
+++ b/lib/dslash_twisted_mass_preconditioned.hpp
@@ -0,0 +1,106 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_twisted_mass_preconditioned.cuh>
+
+/**
+   This is the preconditioned gauged twisted-mass operator
+*/
+
+namespace quda
+{
+
+  // trait to ensure we don't instantiate asymmetric & xpay (the parameter is named `symmetric` but callers pass Arg::asymmetric)
+  template <bool symmetric> constexpr bool xpay_() { return true; }
+  template <> constexpr bool xpay_<true>() { return false; } // asymmetric Arg: xpay is forced off
+
+  // trait to ensure we don't instantiate asymmetric & !dagger (the parameter is named `symmetric` but callers pass Arg::asymmetric)
+  template <bool symmetric> constexpr bool not_dagger_() { return false; }
+  template <> constexpr bool not_dagger_<true>() { return true; } // asymmetric Arg: the dagger flag is forced on
+
+  template <typename Arg> class TwistedMassPreconditioned : public Dslash<twistedMassPreconditioned, Arg>
+  { // Dslash specialization for the twistedMassPreconditioned kernel: apply() dispatches, flops() reports cost
+    using Dslash = Dslash<twistedMassPreconditioned, Arg>;
+    using Dslash::arg;
+    using Dslash::in;
+
+  public:
+    TwistedMassPreconditioned(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                              const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo)
+    {
+    }
+
+    void apply(const qudaStream_t &stream) // fetch the tuned launch parameters, validate the flags, then dispatch
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      if (arg.asymmetric && !arg.dagger) errorQuda("asymmetric operator only defined for dagger");
+      if (arg.asymmetric && arg.xpay) errorQuda("asymmetric operator not defined for xpay");
+      if (arg.nParity != 1) errorQuda("Preconditioned twisted-mass operator not defined nParity=%d", arg.nParity);
+      // for an asymmetric Arg the traits collapse every branch below to <packShmem, 1, true, false>
+      if (arg.dagger) {
+        if (arg.xpay)
+          Dslash::template instantiate<packShmem, 1, true, xpay_<Arg::asymmetric>()>(tp, stream);
+        else
+          Dslash::template instantiate<packShmem, 1, true, false>(tp, stream);
+      } else {
+        if (arg.xpay)
+          Dslash::template instantiate<packShmem, 1, not_dagger_<Arg::asymmetric>(), xpay_<Arg::asymmetric>()>(tp, stream);
+        else
+          Dslash::template instantiate<packShmem, 1, not_dagger_<Arg::asymmetric>(), false>(tp, stream);
+      }
+    }
+
+    long long flops() const // Dslash::flops() plus the twist-term flops for interior / uber / policy kernel types
+    {
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+        flops += in.size() * 2 * in.Ncolor() * 4 * 2 * in.Volume(); // complex * Nc * Ns * fma * vol
+        break;
+      default: break; // exterior kernel types add nothing here
+      }
+      return flops;
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType { // empty tag type: lets a constructor template deduce distance_pc
+  }; // NOTE(review): this header has no include guard and sibling dslash_*.hpp headers define the same template
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
+  struct TwistedMassPreconditionedApply { // all work happens in the constructor: build the kernel arg, then run the dslash policy
+    template <bool distance_pc> // deduced from the unnamed DistanceType tag argument; not used in the body
+    TwistedMassPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                   cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b,
+                                   bool xpay, int parity, bool dagger, bool asymmetric, const int *comm_override,
+                                   DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY // declaration only; the body is compiled where this header is included without SIGNATURE_ONLY
+      ;
+#else
+    {
+      constexpr int nDim = 4;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      if (asymmetric) { // the runtime flag selects the compile-time asymmetric variant of TwistedMassArg
+        TwistedMassArg<Float, nColor, nDim, DDArg, recon, true> arg(out, in, halo, U, a, b, xpay, x, parity, dagger,
+                                                                    comm_override);
+        TwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+
+        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
+      } else {
+        TwistedMassArg<Float, nColor, nDim, DDArg, recon, false> arg(out, in, halo, U, a, b, xpay, x, parity, dagger,
+                                                                     comm_override);
+        TwistedMassPreconditioned<decltype(arg)> twisted(arg, out, in, halo);
+
+        dslash::DslashPolicyTune<decltype(twisted)> policy(twisted, in, halo, profile);
+      }
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_twisted_mass_preconditioned.in.cu b/lib/dslash_twisted_mass_preconditioned.in.cu
new file mode 100644
index 0000000000..cbcec90d05
--- /dev/null
+++ b/lib/dslash_twisted_mass_preconditioned.in.cu
@@ -0,0 +1,21 @@
+#include <dslash_twisted_mass_preconditioned.hpp>
+
+namespace quda
+{
+  // @NAME@ tokens are build-time placeholders (presumably substituted by CMake when this .in.cu is configured; confirm)
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@; // NOTE(review): dslash_twisted_mass_preconditioned.cu passes DistanceType<false> only
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float;
+	
+  // explicit instantiation of the class template for the selected Float / nColor / DDArg / reconstruct type
+  template struct TwistedMassPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+  // the constructor is itself a template, so it is explicitly instantiated separately for DistanceType<distance_pc>
+  template TwistedMassPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::TwistedMassPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                   cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double b,
+                                   bool xpay, int parity, bool dagger, bool asymmetric, const int *comm_override,
+                                   DistanceType<distance_pc>,TimeProfile &profile);
+} // namespace quda
diff --git a/lib/dslash_wilson.cu b/lib/dslash_wilson.cu
index 0dc7f8ebb1..414cb19495 100644
--- a/lib/dslash_wilson.cu
+++ b/lib/dslash_wilson.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_wilson.hpp>
+#undef SIGNATURE_ONLY
 
 namespace quda
 {
diff --git a/lib/dslash_wilson.hpp b/lib/dslash_wilson.hpp
index fa5a9400b4..78a864f640 100644
--- a/lib/dslash_wilson.hpp
+++ b/lib/dslash_wilson.hpp
@@ -37,20 +37,24 @@ namespace quda
   template <bool distance_pc> struct DistanceType {
   };
 
-  template <typename Float, int nColor, QudaReconstructType recon> struct WilsonApply {
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct WilsonApply {
 
     template <bool distance_pc>
     WilsonApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                 cvector_ref<const ColorSpinorField> &x, const GaugeField &U, double a, double alpha0, int t0,
                 int parity, bool dagger, const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY //Used to hide from the compiler the implementation of the function
+      ;
+#else
     {
       constexpr int nDim = 4;
       auto halo = ColorSpinorField::create_comms_batch(in);
-      WilsonArg<Float, nColor, nDim, recon, distance_pc> arg(out, in, halo, U, a, x, parity, dagger, comm_override,
-                                                             alpha0, t0);
+      WilsonArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, a, x, parity, dagger,
+                                                                    comm_override, alpha0, t0);
       Wilson<decltype(arg)> wilson(arg, out, in, halo);
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
     }
+#endif
   };
 
 } // namespace quda
diff --git a/lib/dslash_wilson.in.cu b/lib/dslash_wilson.in.cu
new file mode 100644
index 0000000000..4ac0e08e09
--- /dev/null
+++ b/lib/dslash_wilson.in.cu
@@ -0,0 +1,21 @@
+#include <dslash_wilson.hpp>
+
+namespace quda
+{
+  // @NAME@ tokens are build-time placeholders (presumably substituted by CMake when this .in.cu is configured; confirm)
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float;
+  // explicit instantiation of the class template for the selected Float / nColor / DDArg / reconstruct type
+  template struct WilsonApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+  // the constructor is itself a template, so it is explicitly instantiated separately for DistanceType<distance_pc>
+  template WilsonApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonApply(
+    cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
+    const GaugeField &U, double a, double alpha0, int t0, int parity, bool dagger, const int *comm_override,
+    DistanceType<distance_pc>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_wilson_clover.cu b/lib/dslash_wilson_clover.cu
index 96413eeba8..9992b64d27 100644
--- a/lib/dslash_wilson_clover.cu
+++ b/lib/dslash_wilson_clover.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_wilson_clover.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the Wilson-clover linear operator
diff --git a/lib/dslash_wilson_clover.hpp b/lib/dslash_wilson_clover.hpp
index 4e40d583c9..59ae5b6a96 100644
--- a/lib/dslash_wilson_clover.hpp
+++ b/lib/dslash_wilson_clover.hpp
@@ -71,22 +71,25 @@ namespace quda
   template <bool distance_pc> struct DistanceType {
   };
 
-  template <typename Float, int nColor, QudaReconstructType recon> struct WilsonCloverApply {
-
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct WilsonCloverApply {
     template <bool distance_pc>
     WilsonCloverApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                       cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A, double a,
                       double alpha0, int t0, int parity, bool dagger, const int *comm_override,
                       DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+    ;
+#else
     {
       constexpr int nDim = 4;
       auto halo = ColorSpinorField::create_comms_batch(in);
-      WilsonCloverArg<Float, nColor, nDim, recon, false, distance_pc> arg(out, in, halo, U, A, a, 0.0, x, parity,
-                                                                          dagger, comm_override, alpha0, t0);
+      WilsonCloverArg<Float, nColor, nDim, DDArg, recon, false, distance_pc> arg(out, in, halo, U, A, a, 0.0, x, parity,
+                                                                                 dagger, comm_override, alpha0, t0);
       WilsonClover<decltype(arg)> wilson(arg, out, in, halo, A);
 
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
     }
+#endif
   };
 
 } // namespace quda
diff --git a/lib/dslash_wilson_clover.in.cu b/lib/dslash_wilson_clover.in.cu
new file mode 100644
index 0000000000..40cb2ac36b
--- /dev/null
+++ b/lib/dslash_wilson_clover.in.cu
@@ -0,0 +1,26 @@
+#include <dslash_wilson_clover.hpp>
+
+/**
+   This is the Wilson-clover linear operator
+*/
+
+namespace quda
+{
+  // @NAME@ tokens are build-time placeholders (presumably substituted by CMake when this .in.cu is configured; confirm)
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float;
+  // explicit instantiation of the class template for the selected Float / nColor / DDArg / reconstruct type
+  template struct WilsonCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+  // the constructor is itself a template, so it is explicitly instantiated separately for DistanceType<distance_pc>
+  template WilsonCloverApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonCloverApply(
+			  cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+			  cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A, double a,
+			  double alpha0, int t0, int parity, bool dagger, const int *comm_override,
+			  DistanceType<distance_pc>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_wilson_clover_distance.cu b/lib/dslash_wilson_clover_distance.cu
index 8ff71313da..f578143906 100644
--- a/lib/dslash_wilson_clover_distance.cu
+++ b/lib/dslash_wilson_clover_distance.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_wilson_clover.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the Wilson-clover linear operator
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist.cu b/lib/dslash_wilson_clover_hasenbusch_twist.cu
index 3d09cc5af6..f600980588 100644
--- a/lib/dslash_wilson_clover_hasenbusch_twist.cu
+++ b/lib/dslash_wilson_clover_hasenbusch_twist.cu
@@ -1,93 +1,10 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <clover_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_wilson_clover_hasenbusch_twist.cuh>
-
-/**
-   This is the Wilson-clover linear operator
-*/
+#define SIGNATURE_ONLY
+#include <dslash_wilson_clover_hasenbusch_twist.hpp>
+#undef SIGNATURE_ONLY
 
 namespace quda
 {
 
-  template <typename Arg> class WilsonCloverHasenbuschTwist : public Dslash<cloverHasenbusch, Arg>
-  {
-    using Dslash = Dslash<cloverHasenbusch, Arg>;
-    using Dslash::arg;
-    using Dslash::in;
-
-  public:
-    WilsonCloverHasenbuschTwist(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-      if (arg.xpay)
-        Dslash::template instantiate<packShmem, true>(tp, stream);
-      else
-        errorQuda("Wilson-clover - Hasenbusch Twist operator only defined for xpay=true");
-    }
-
-    long long flops() const
-    {
-      int clover_flops = in.size() * 504;
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-        flops += clover_flops * in.Volume();
-
-        // -mu * (i gamma_5 A) (A x)
-        flops += ((clover_flops + 48) * in.Volume());
-
-        break;
-      default: break; // all clover flops are in the interior kernel
-      }
-      return flops;
-    }
-
-    long long bytes() const
-    {
-      int clover_bytes = in.size() * 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
-
-      long long bytes = Dslash::bytes();
-      switch (arg.kernel_type) {
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY: bytes += clover_bytes * in.Volume(); break;
-      default: break;
-      }
-
-      return bytes;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct WilsonCloverHasenbuschTwistApply {
-
-    WilsonCloverHasenbuschTwistApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                     cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A,
-                                     double a, double b, int parity, bool dagger, const int *comm_override,
-                                     TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      WilsonCloverHasenbuschTwistArg<Float, nColor, nDim, recon> arg(out, in, halo, U, A, a, b, x, parity, dagger,
-                                                                     comm_override);
-      WilsonCloverHasenbuschTwist<decltype(arg)> wilson(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
-    }
-  };
-
   // Apply the Wilson-clover operator
   // out(x) = M*in = (A(x) + a * \sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu))
   // Uses the kappa normalization for the Wilson operator.
@@ -97,7 +14,9 @@ namespace quda
                                         const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH>()) {
-      instantiate<WilsonCloverHasenbuschTwistApply>(out, in, x, U, A, a, b, parity, dagger, comm_override, profile);
+      auto dummy = DistanceType<false>();
+      instantiate<WilsonCloverHasenbuschTwistApply>(out, in, x, U, A, a, b, parity, dagger, comm_override, dummy,
+                                                    profile);
     } else {
       errorQuda("Clover Hasensbuch Twist operator has not been built");
     }
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist.hpp b/lib/dslash_wilson_clover_hasenbusch_twist.hpp
new file mode 100644
index 0000000000..6991048398
--- /dev/null
+++ b/lib/dslash_wilson_clover_hasenbusch_twist.hpp
@@ -0,0 +1,99 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <clover_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_wilson_clover_hasenbusch_twist.cuh>
+
+/**
+   This is the Wilson-clover Hasenbusch twist linear operator
+*/
+
+namespace quda
+{
+
+  template <typename Arg> class WilsonCloverHasenbuschTwist : public Dslash<cloverHasenbusch, Arg>
+  {
+    using Dslash = Dslash<cloverHasenbusch, Arg>;
+    using Dslash::arg;
+    using Dslash::in;
+
+  public:
+    WilsonCloverHasenbuschTwist(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo)
+    {
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      if (arg.xpay)
+        Dslash::template instantiate<packShmem, true>(tp, stream);
+      else
+        errorQuda("Wilson-clover - Hasenbusch Twist operator only defined for xpay=true");
+    }
+
+    long long flops() const
+    {
+      int clover_flops = in.size() * 504;
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+        flops += clover_flops * in.Volume();
+
+        // -mu * (i gamma_5 A) (A x)
+        flops += ((clover_flops + 48) * in.Volume());
+
+        break;
+      default: break; // all clover flops are in the interior kernel
+      }
+      return flops;
+    }
+
+    long long bytes() const
+    {
+      int clover_bytes = in.size() * 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
+
+      long long bytes = Dslash::bytes();
+      switch (arg.kernel_type) {
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY: bytes += clover_bytes * in.Volume(); break;
+      default: break;
+      }
+
+      return bytes;
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType {
+  };
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
+  struct WilsonCloverHasenbuschTwistApply {
+    template <bool distance_pc>
+    WilsonCloverHasenbuschTwistApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                     cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A,
+                                     double a, double b, int parity, bool dagger, const int *comm_override,DistanceType<distance_pc>,
+                                     TimeProfile &profile)
+#ifdef SIGNATURE_ONLY // Used to hide from the compiler the implementation of the function
+      ;
+#else
+    {
+      constexpr int nDim = 4;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      WilsonCloverHasenbuschTwistArg<Float, nColor, nDim, DDArg, recon> arg(out, in, halo, U, A, a, b, x, parity,
+                                                                            dagger, comm_override);
+      WilsonCloverHasenbuschTwist<decltype(arg)> wilson(arg, out, in, halo);
+      dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist.in.cu b/lib/dslash_wilson_clover_hasenbusch_twist.in.cu
new file mode 100644
index 0000000000..08840c0e82
--- /dev/null
+++ b/lib/dslash_wilson_clover_hasenbusch_twist.in.cu
@@ -0,0 +1,21 @@
+#include <dslash_wilson_clover_hasenbusch_twist.hpp>
+
+namespace quda
+{
+
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float;
+
+  template struct WilsonCloverHasenbuschTwistApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+
+  template WilsonCloverHasenbuschTwistApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonCloverHasenbuschTwistApply(
+    cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x,
+    const GaugeField &U, const CloverField &A, double a, double b, int parity, bool dagger, const int *comm_override,
+    DistanceType<distance_pc>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
index e0938ebe7a..67dab8b2a8 100644
--- a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
+++ b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.cu
@@ -1,140 +1,10 @@
-#include <gauge_field.h>
-#include <color_spinor_field.h>
-#include <clover_field.h>
-#include <dslash.h>
-#include <worker.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh>
+#define SIGNATURE_ONLY
+#include <dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp>
+#undef SIGNATURE_ONLY
 
 namespace quda
 {
 
-  /* ***************************
-   * No Clov Inv:  1 - k^2 D - i mu gamma_5 A
-   * **************************/
-  template <typename Arg>
-  class WilsonCloverHasenbuschTwistPCNoClovInv : public Dslash<cloverHasenbuschPreconditioned, Arg>
-  {
-    using Dslash = Dslash<cloverHasenbuschPreconditioned, Arg>;
-    using Dslash::arg;
-    using Dslash::halo;
-    using Dslash::in;
-
-  public:
-    WilsonCloverHasenbuschTwistPCNoClovInv(Arg &arg, cvector_ref<ColorSpinorField> &out,
-                                           cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-
-      // specialize here to constrain the template instantiation
-      if (arg.nParity == 1) {
-        if (arg.xpay)
-          Dslash::template instantiate<packShmem, 1, true>(tp, stream);
-        else
-          errorQuda("Operator only defined for xpay=true");
-      } else {
-        errorQuda("Operator not defined nParity=%d", arg.nParity);
-      }
-    }
-
-    long long flops() const
-    {
-      int clover_flops = 504;
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T:
-        // 2 from fwd / back face * 1 clover terms:
-        // there is no A^{-1}D only D
-        // there is one clover_term and 48 is the - mu (igamma_5) A
-        flops += 2 * (clover_flops + 48) * halo.GhostFace()[arg.kernel_type];
-        break;
-      case EXTERIOR_KERNEL_ALL:
-        flops += 2 * (clover_flops + 48)
-          * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        break;
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-        flops += (clover_flops + 48) * halo.Volume();
-
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for flops done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        flops -= (clover_flops + 48) * ghost_sites;
-
-        break;
-      }
-      return flops;
-    }
-
-    long long bytes() const
-    {
-      int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
-
-      long long bytes = Dslash::bytes();
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T:
-        // Factor of 2 is from the fwd/back faces.
-        bytes += clover_bytes * 2 * halo.GhostFace()[arg.kernel_type];
-        break;
-      case EXTERIOR_KERNEL_ALL:
-        // Factor of 2 is from the fwd/back faces
-        bytes
-          += clover_bytes * 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        break;
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-
-        bytes += clover_bytes * halo.Volume();
-
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for bytes done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        bytes -= clover_bytes * ghost_sites;
-
-        break;
-      }
-
-      return bytes;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct WilsonCloverHasenbuschTwistPCNoClovInvApply {
-
-    WilsonCloverHasenbuschTwistPCNoClovInvApply(cvector_ref<ColorSpinorField> &out,
-                                                cvector_ref<const ColorSpinorField> &in,
-                                                cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
-                                                const CloverField &A, double a, double b, int parity, bool dagger,
-                                                const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      using ArgType = WilsonCloverHasenbuschTwistPCArg<Float, nColor, nDim, recon, false>;
-      ArgType arg(out, in, halo, U, A, a, b, x, parity, dagger, comm_override);
-      WilsonCloverHasenbuschTwistPCNoClovInv<ArgType> wilson(arg, out, in, halo);
-
-      dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
-    }
-  };
-
   // Apply the Wilson-clover operator
   // out(x) = M*in = (A(x) + kappa * \sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu))
   // Uses the kappa normalization for the Wilson operator.
@@ -145,141 +15,14 @@ namespace quda
                                                    const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH>()) {
-      instantiate<WilsonCloverHasenbuschTwistPCNoClovInvApply>(out, in, x, U, A, a, b, parity, dagger, comm_override,
+      auto dummy = DistanceType<false>();
+      instantiate<WilsonCloverHasenbuschTwistPCNoClovInvApply>(out, in, x, U, A, a, b, parity, dagger, comm_override, dummy,
                                                                profile);
     } else {
       errorQuda("Clover Hasenbusch Twist operator has not been built");
     }
   }
 
-  /* ***************************
-   * Clov Inv
-   *
-   * M = psi_p - k^2 A^{-1} D_p\not{p} - i mu gamma_5 A_{pp} psi_{p}
-   * **************************/
-  template <typename Arg>
-  class WilsonCloverHasenbuschTwistPCClovInv : public Dslash<cloverHasenbuschPreconditioned, Arg>
-  {
-    using Dslash = Dslash<cloverHasenbuschPreconditioned, Arg>;
-    using Dslash::arg;
-    using Dslash::halo;
-    using Dslash::in;
-
-  public:
-    WilsonCloverHasenbuschTwistPCClovInv(Arg &arg, cvector_ref<ColorSpinorField> &out,
-                                         cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream)
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-
-      // specialize here to constrain the template instantiation
-      if (arg.nParity == 1) {
-        if (arg.xpay)
-          Dslash::template instantiate<packShmem, 1, true>(tp, stream);
-        else
-          errorQuda("Operator only defined for xpay=true");
-      } else {
-        errorQuda("Operator not defined nParity=%d", arg.nParity);
-      }
-    }
-
-    long long flops() const
-    {
-      int clover_flops = 504;
-      long long flops = Dslash::flops();
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T:
-        // 2 from fwd / back face * 2 clover terms:
-        // one clover_term from the A^{-1}D
-        // second clover_term and 48 is the - mu (igamma_5) A
-        flops += 2 * (2 * clover_flops + 48) * halo.GhostFace()[arg.kernel_type];
-        break;
-      case EXTERIOR_KERNEL_ALL:
-        flops += 2 * (2 * clover_flops + 48)
-          * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        break;
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-        flops += (2 * clover_flops + 48) * halo.Volume();
-
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for flops done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        flops -= (2 * clover_flops + 48) * ghost_sites;
-
-        break;
-      }
-      return flops;
-    }
-
-    long long bytes() const
-    {
-      int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
-
-      // if we use dynamic clover we read only A (even for A^{-1}
-      // otherwise we read both A and A^{-1}
-      int dyn_factor = arg.dynamic_clover ? 1 : 2;
-
-      long long bytes = Dslash::bytes();
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T:
-        // Factor of 2 is from the fwd/back faces.
-        bytes += dyn_factor * clover_bytes * 2 * halo.GhostFace()[arg.kernel_type];
-        break;
-      case EXTERIOR_KERNEL_ALL:
-        // Factor of 2 is from the fwd/back faces
-        bytes += dyn_factor * clover_bytes * 2
-          * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        break;
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY:
-
-        bytes += dyn_factor * clover_bytes * halo.Volume();
-
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for bytes done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        bytes -= dyn_factor * clover_bytes * ghost_sites;
-
-        break;
-      }
-
-      return bytes;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct WilsonCloverHasenbuschTwistPCClovInvApply {
-
-    WilsonCloverHasenbuschTwistPCClovInvApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                                              cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
-                                              const CloverField &A, double kappa, double mu, int parity, bool dagger,
-                                              const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in);
-      using ArgType = WilsonCloverHasenbuschTwistPCArg<Float, nColor, nDim, recon, true>;
-      ArgType arg(out, in, halo, U, A, kappa, mu, x, parity, dagger, comm_override);
-      WilsonCloverHasenbuschTwistPCClovInv<ArgType> wilson(arg, out, in, halo);
-      dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
-    }
-  };
 
   // Apply the Wilson-clover operator
   // out(x) = M*in = (A(x) + kappa * \sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu))
@@ -291,7 +34,8 @@ namespace quda
                                                  const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH>()) {
-      instantiate<WilsonCloverHasenbuschTwistPCClovInvApply>(out, in, x, U, A, a, b, parity, dagger, comm_override,
+      auto dummy = DistanceType<false>();
+      instantiate<WilsonCloverHasenbuschTwistPCClovInvApply>(out, in, x, U, A, a, b, parity, dagger, comm_override, dummy,
                                                              profile);
     } else {
       errorQuda("Clover Hasenbusch Twist operator has not been built");
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp
new file mode 100644
index 0000000000..b6e1fcc64d
--- /dev/null
+++ b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp
@@ -0,0 +1,282 @@
+#include <gauge_field.h>
+#include <color_spinor_field.h>
+#include <clover_field.h>
+#include <dslash.h>
+#include <worker.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/dslash_wilson_clover_hasenbusch_twist_preconditioned.cuh>
+
+namespace quda
+{
+
+  /* ***************************
+   * No Clov Inv:  1 - k^2 D - i mu gamma_5 A
+   * **************************/
+  template <typename Arg>
+  class WilsonCloverHasenbuschTwistPCNoClovInv : public Dslash<cloverHasenbuschPreconditioned, Arg>
+  {
+    using Dslash = Dslash<cloverHasenbuschPreconditioned, Arg>;
+    using Dslash::arg;
+    using Dslash::halo;
+    using Dslash::in;
+
+  public:
+    WilsonCloverHasenbuschTwistPCNoClovInv(Arg &arg, cvector_ref<ColorSpinorField> &out,
+                                           cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo)
+    {
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+
+      // specialize here to constrain the template instantiation
+      if (arg.nParity == 1) {
+        if (arg.xpay)
+          Dslash::template instantiate<packShmem, 1, true>(tp, stream);
+        else
+          errorQuda("Operator only defined for xpay=true");
+      } else {
+        errorQuda("Operator not defined nParity=%d", arg.nParity);
+      }
+    }
+
+    long long flops() const
+    {
+      int clover_flops = 504;
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T:
+        // 2 from fwd / back face * 1 clover terms:
+        // there is no A^{-1}D only D
+        // there is one clover_term and 48 is the - mu (igamma_5) A
+        flops += 2 * (clover_flops + 48) * halo.GhostFace()[arg.kernel_type];
+        break;
+      case EXTERIOR_KERNEL_ALL:
+        flops += 2 * (clover_flops + 48)
+          * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        break;
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+        flops += (clover_flops + 48) * halo.Volume();
+
+        if (arg.kernel_type == KERNEL_POLICY) break;
+        // now correct for flops done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        flops -= (clover_flops + 48) * ghost_sites;
+
+        break;
+      }
+      return flops;
+    }
+
+    long long bytes() const
+    {
+      int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
+
+      long long bytes = Dslash::bytes();
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T:
+        // Factor of 2 is from the fwd/back faces.
+        bytes += clover_bytes * 2 * halo.GhostFace()[arg.kernel_type];
+        break;
+      case EXTERIOR_KERNEL_ALL:
+        // Factor of 2 is from the fwd/back faces
+        bytes
+          += clover_bytes * 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        break;
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+
+        bytes += clover_bytes * halo.Volume();
+
+        if (arg.kernel_type == KERNEL_POLICY) break;
+        // now correct for bytes done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        bytes -= clover_bytes * ghost_sites;
+
+        break;
+      }
+
+      return bytes;
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType {
+  };
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
+  struct WilsonCloverHasenbuschTwistPCNoClovInvApply {
+    template <bool distance_pc>
+    WilsonCloverHasenbuschTwistPCNoClovInvApply(cvector_ref<ColorSpinorField> &out,
+                                                cvector_ref<const ColorSpinorField> &in,
+                                                cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
+                                                const CloverField &A, double a, double b, int parity, bool dagger,
+                                                const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+    ;
+#else
+    {
+      constexpr int nDim = 4;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      using ArgType = WilsonCloverHasenbuschTwistPCArg<Float, nColor, nDim, DDArg, recon, false>;
+      ArgType arg(out, in, halo, U, A, a, b, x, parity, dagger, comm_override);
+      WilsonCloverHasenbuschTwistPCNoClovInv<ArgType> wilson(arg, out, in, halo);
+
+      dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
+    }
+#endif
+  };
+
+
+  /* ***************************
+   * Clov Inv
+   *
+   * M = psi_p - k^2 A^{-1} D_p\not{p} - i mu gamma_5 A_{pp} psi_{p}
+   * **************************/
+  template <typename Arg>
+  class WilsonCloverHasenbuschTwistPCClovInv : public Dslash<cloverHasenbuschPreconditioned, Arg>
+  {
+    using Dslash = Dslash<cloverHasenbuschPreconditioned, Arg>;
+    using Dslash::arg;
+    using Dslash::halo;
+    using Dslash::in;
+
+  public:
+    WilsonCloverHasenbuschTwistPCClovInv(Arg &arg, cvector_ref<ColorSpinorField> &out,
+                                         cvector_ref<const ColorSpinorField> &in, const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo)
+    {
+    }
+
+    void apply(const qudaStream_t &stream)
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+
+      // specialize here to constrain the template instantiation
+      if (arg.nParity == 1) {
+        if (arg.xpay)
+          Dslash::template instantiate<packShmem, 1, true>(tp, stream);
+        else
+          errorQuda("Operator only defined for xpay=true");
+      } else {
+        errorQuda("Operator not defined nParity=%d", arg.nParity);
+      }
+    }
+
+    long long flops() const
+    {
+      int clover_flops = 504;
+      long long flops = Dslash::flops();
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T:
+        // 2 from fwd / back face * 2 clover terms:
+        // one clover_term from the A^{-1}D
+        // second clover_term and 48 is the - mu (igamma_5) A
+        flops += 2 * (2 * clover_flops + 48) * halo.GhostFace()[arg.kernel_type];
+        break;
+      case EXTERIOR_KERNEL_ALL:
+        flops += 2 * (2 * clover_flops + 48)
+          * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        break;
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+        flops += (2 * clover_flops + 48) * halo.Volume();
+
+        if (arg.kernel_type == KERNEL_POLICY) break;
+        // now correct for flops done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        flops -= (2 * clover_flops + 48) * ghost_sites;
+
+        break;
+      }
+      return flops;
+    }
+
+    long long bytes() const
+    {
+      int clover_bytes = 72 * in.Precision() + (isFixed<typename Arg::Float>::value ? 2 * sizeof(float) : 0);
+
+      // if we use dynamic clover we read only A (even for A^{-1})
+      // otherwise we read both A and A^{-1}
+      int dyn_factor = arg.dynamic_clover ? 1 : 2;
+
+      long long bytes = Dslash::bytes();
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T:
+        // Factor of 2 is from the fwd/back faces.
+        bytes += dyn_factor * clover_bytes * 2 * halo.GhostFace()[arg.kernel_type];
+        break;
+      case EXTERIOR_KERNEL_ALL:
+        // Factor of 2 is from the fwd/back faces
+        bytes += dyn_factor * clover_bytes * 2
+          * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        break;
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY:
+
+        bytes += dyn_factor * clover_bytes * halo.Volume();
+
+        if (arg.kernel_type == KERNEL_POLICY) break;
+        // now correct for bytes done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        bytes -= dyn_factor * clover_bytes * ghost_sites;
+
+        break;
+      }
+
+      return bytes;
+    }
+  };
+
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon>
+  struct WilsonCloverHasenbuschTwistPCClovInvApply {
+    template <bool distance_pc>
+    WilsonCloverHasenbuschTwistPCClovInvApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                              cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
+                                              const CloverField &A, double kappa, double mu, int parity, bool dagger,
+                                              const int *comm_override,DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY
+    ;
+#else
+    {
+      constexpr int nDim = 4;
+      auto halo = ColorSpinorField::create_comms_batch(in);
+      using ArgType = WilsonCloverHasenbuschTwistPCArg<Float, nColor, nDim, DDArg, recon, true>;
+      ArgType arg(out, in, halo, U, A, kappa, mu, x, parity, dagger, comm_override);
+      WilsonCloverHasenbuschTwistPCClovInv<ArgType> wilson(arg, out, in, halo);
+      dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
+    }
+#endif
+  };
+
+} // namespace quda
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned_ClovInv.in.cu b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned_ClovInv.in.cu
new file mode 100644
index 0000000000..91e927859f
--- /dev/null
+++ b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned_ClovInv.in.cu
@@ -0,0 +1,21 @@
+#include <dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp>
+
+namespace quda
+{
+
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+
+  typedef @QUDA_DSLASH_DDARG@ DDArg;
+  typedef precision_type_mapper<precision>::type Float;
+
+
+  template struct WilsonCloverHasenbuschTwistPCClovInvApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+
+  template WilsonCloverHasenbuschTwistPCClovInvApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::WilsonCloverHasenbuschTwistPCClovInvApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                              cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
+                                              const CloverField &A, double kappa, double mu, int parity, bool dagger,
+                                              const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile);
+} // namespace quda
diff --git a/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned_NoClovInv.in.cu b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned_NoClovInv.in.cu
new file mode 100644
index 0000000000..0079450b3f
--- /dev/null
+++ b/lib/dslash_wilson_clover_hasenbusch_twist_preconditioned_NoClovInv.in.cu
@@ -0,0 +1,24 @@
+#include <dslash_wilson_clover_hasenbusch_twist_preconditioned.hpp>
+
+namespace quda
+{
+  // Template arguments selected for this translation unit. NOTE(review): the @...@ tokens look like
+  // placeholders that the build system substitutes when generating the real source -- confirm.
+  using DDArg = @QUDA_DSLASH_DDARG@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  using Float = precision_type_mapper<precision>::type;
+
+  // Explicit instantiation of the apply struct for the arguments chosen above ...
+  template struct WilsonCloverHasenbuschTwistPCNoClovInvApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+
+  // ... and of its constructor for the configured distance_pc.
+  template WilsonCloverHasenbuschTwistPCNoClovInvApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::
+    WilsonCloverHasenbuschTwistPCNoClovInvApply(cvector_ref<ColorSpinorField> &out,
+      cvector_ref<const ColorSpinorField> &in, cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
+      const CloverField &A, double a, double b, int parity, bool dagger, const int *comm_override,
+      DistanceType<distance_pc>, TimeProfile &profile);
+
+} // namespace quda
diff --git a/lib/dslash_wilson_clover_preconditioned.cu b/lib/dslash_wilson_clover_preconditioned.cu
index 6102d9c2e2..237e4b07f0 100644
--- a/lib/dslash_wilson_clover_preconditioned.cu
+++ b/lib/dslash_wilson_clover_preconditioned.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_wilson_clover_preconditioned.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the Wilson-clover preconditioned linear operator
diff --git a/lib/dslash_wilson_clover_preconditioned.hpp b/lib/dslash_wilson_clover_preconditioned.hpp
index a144b64e62..0ca37f886c 100644
--- a/lib/dslash_wilson_clover_preconditioned.hpp
+++ b/lib/dslash_wilson_clover_preconditioned.hpp
@@ -116,22 +116,27 @@ namespace quda
   template <bool distance_pc> struct DistanceType {
   };
 
-  template <typename Float, int nColor, QudaReconstructType recon> struct WilsonCloverPreconditionedApply {
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> // DDArg is forwarded to WilsonCloverArg
+  struct WilsonCloverPreconditionedApply { // all work happens in the constructor: build arg + dslash object, run the tuned policy
 
     template <bool distance_pc>
     WilsonCloverPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                                     cvector_ref<const ColorSpinorField> &x, const GaugeField &U, const CloverField &A,
                                     double a, double alpha0, int t0, int parity, bool dagger, const int *comm_override,
                                     DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY // declaration only: files that define SIGNATURE_ONLY before including this header skip the body
+    ;
+#else // full definition, compiled wherever SIGNATURE_ONLY is not defined
     {
       constexpr int nDim = 4;
       auto halo = ColorSpinorField::create_comms_batch(in);
-      WilsonCloverArg<Float, nColor, nDim, recon, distance_pc> arg(out, in, halo, U, A, a, x, parity, dagger,
-                                                                   comm_override, alpha0, t0);
+      WilsonCloverArg<Float, nColor, nDim, DDArg, recon, distance_pc> arg(out, in, halo, U, A, a, x, parity, dagger,
+                                                                          comm_override, alpha0, t0);
       WilsonCloverPreconditioned<decltype(arg)> wilson(arg, out, in, halo, A);
 
       dslash::DslashPolicyTune<decltype(wilson)> policy(wilson, in, halo, profile);
     }
+#endif // SIGNATURE_ONLY
   };
 
 } // namespace quda
diff --git a/lib/dslash_wilson_clover_preconditioned.in.cu b/lib/dslash_wilson_clover_preconditioned.in.cu
new file mode 100644
index 0000000000..549522f8d4
--- /dev/null
+++ b/lib/dslash_wilson_clover_preconditioned.in.cu
@@ -0,0 +1,28 @@
+#include <dslash_wilson_clover_preconditioned.hpp>
+
+/**
+   Explicit instantiation of the Wilson-clover preconditioned linear operator
+*/
+
+namespace quda
+{
+  // Template arguments selected for this translation unit. NOTE(review): the @...@ tokens look like
+  // placeholders that the build system substitutes when generating the real source -- confirm.
+  using DDArg = @QUDA_DSLASH_DDARG@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  using Float = precision_type_mapper<precision>::type;
+
+  // Explicit instantiation of the apply struct for the arguments chosen above ...
+  template struct WilsonCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+
+  // ... and of its constructor for the configured distance_pc.
+  template WilsonCloverPreconditionedApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::
+    WilsonCloverPreconditionedApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                                    cvector_ref<const ColorSpinorField> &x, const GaugeField &U,
+                                    const CloverField &A, double a, double alpha0, int t0, int parity, bool dagger,
+                                    const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile);
+} // namespace quda
+
diff --git a/lib/dslash_wilson_clover_preconditioned_distance.cu b/lib/dslash_wilson_clover_preconditioned_distance.cu
index eac8ed1ff9..33c65dd9a7 100644
--- a/lib/dslash_wilson_clover_preconditioned_distance.cu
+++ b/lib/dslash_wilson_clover_preconditioned_distance.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_wilson_clover_preconditioned.hpp>
+#undef SIGNATURE_ONLY
 
 /**
    This is the Wilson-clover preconditioned linear operator
diff --git a/lib/dslash_wilson_distance.cu b/lib/dslash_wilson_distance.cu
index 414924f992..aad9d3a7e6 100644
--- a/lib/dslash_wilson_distance.cu
+++ b/lib/dslash_wilson_distance.cu
@@ -1,4 +1,6 @@
+#define SIGNATURE_ONLY
 #include <dslash_wilson.hpp>
+#undef SIGNATURE_ONLY
 
 namespace quda
 {
diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp
index 40a1351656..0b3657102b 100644
--- a/lib/gauge_field.cpp
+++ b/lib/gauge_field.cpp
@@ -81,6 +81,8 @@ namespace quda {
       errorQuda("Cannot request a 12/8 reconstruct type without SU(3) link type");
     if (param.reconstruct == QUDA_RECONSTRUCT_10 && param.link_type != QUDA_ASQTAD_MOM_LINKS)
       errorQuda("10-reconstruction only supported with momentum links");
+    if (param.nFace > x[0] || param.nFace > x[1] || param.nFace > x[2] || param.nFace > x[3])
+      errorQuda("Halo depth %d is greater than local lattice x = {%d %d %d %d}", param.nFace, x[0], x[1], x[2], x[3]);
 
     nColor = param.nColor;
     nFace = param.nFace;
diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp
index a4713a16b5..b045ac54ad 100644
--- a/lib/inv_mr_quda.cpp
+++ b/lib/inv_mr_quda.cpp
@@ -25,12 +25,23 @@ namespace quda
     Solver::create(x, b);
 
     if (!init || r.size() != b.size()) {
+
       resize(r, b.size(), QUDA_NULL_FIELD_CREATE, b[0]);
 
-      // now allocate sloppy fields
       ColorSpinorParam csParam(b[0]);
       csParam.create = QUDA_NULL_FIELD_CREATE;
       csParam.setPrecision(param.precision_sloppy);
+
+      // Copy the Schwarz block size into the field parameters and work out whether the blocks are local:
+      // `local` ends up true only if block Schwarz is enabled and x.full_dim(i) is a multiple of block_dim[i] for all i
+      bool local = true;
+      for (int i = 0; i < QUDA_MAX_DIM; i++) {
+        csParam.dd.block_dim[i] = param.schwarz_block[i];
+        // NOTE(review): the modulo below is undefined if schwarz_block[i] == 0 while block Schwarz is enabled -- confirm
+        local &= (param.do_block_schwarz() && x.full_dim(i) % csParam.dd.block_dim[i] == 0);
+      }
+      // Disabling global_reduction if blocks are local and we do block_schwarz
+      if (param.do_block_schwarz() and param.global_reduction) param.global_reduction = not local;
       resize(Ar, b.size(), csParam);
       resize(x_sloppy, b.size(), csParam);
 
@@ -88,20 +99,38 @@ namespace quda
     while (!is_done) {
 
       int k = 0;
+
       vector<double> scale(b.size(), 1.0);
       vector<double> scale_inv(b.size(), 1.0);
       vector<double> delta2(b.size(), param.delta * param.delta);
 
-      if ((node_parity + step) % 2 == 0 && param.schwarz_type == QUDA_MULTIPLICATIVE_SCHWARZ) {
+      if (!param.do_block_schwarz() && param.schwarz_type == QUDA_MULTIPLICATIVE_SCHWARZ
+	    	      && (node_parity + step) % 2 == 0) {
         // for multiplicative Schwarz we alternate updates depending on node parity
       } else {
 
         commGlobalReductionPush(param.global_reduction); // use local reductions for DD solver
 
+        if (param.do_block_schwarz()) {
+          if (param.schwarz_type == QUDA_MULTIPLICATIVE_SCHWARZ) {
+            // Red or black active (alternating with step); both fields start from DD::reset, as in the other branch
+            for (auto i = 0u; i < b.size(); i++) {
+              (Ar[i]).DD(DD::reset, DD::red_black_type, step % 2 == 0 ? DD::red_active : DD::black_active);
+              (r_sloppy[i]).DD(DD::reset, DD::red_black_type, step % 2 == 0 ? DD::red_active : DD::black_active);
+            }
+          } else {
+            // Both red and black active but no hopping
+            for (auto i = 0u; i < b.size(); i++) {
+              (Ar[i]).DD(DD::reset, DD::red_black_type, DD::red_active, DD::black_active, DD::no_block_hopping);
+              (r_sloppy[i]).DD(DD::reset, DD::red_black_type, DD::red_active, DD::black_active, DD::no_block_hopping);
+            }
+          }
+        }
+
         blas::zero(x_sloppy); // can get rid of this for a special first update kernel
-        auto c2 = param.global_reduction == QUDA_BOOLEAN_TRUE ? r2 : blas::norm2(r); // c2 holds the initial r2
+        auto c2 = param.global_reduction == QUDA_BOOLEAN_TRUE ? r2 : blas::norm2(r); // c2 holds the initial r2  
         for (auto i = 0u; i < b.size(); i++) {
-          scale[i] = c2[i] > 0.0 ? sqrt(c2[i]) : 1.0;
+	  scale[i] = c2[i] > 0.0 ? sqrt(c2[i]) : 1.0;
           scale_inv[i] = 1.0 / scale[i];
           // domain-wise normalization of the initial residual to prevent underflow
           if (c2[i] > 0.0) r2[i] = 1.0; // by definition by this is now true
@@ -126,6 +155,7 @@ namespace quda
           } else {
             // doing local reductions so can make it asynchronous
             commAsyncReductionSet(true);
+
             blas::cDotProductNormAB(Ar, r_sloppy);
 
             // omega*alpha is done in the kernel
@@ -139,6 +169,14 @@ namespace quda
 
         blas::axpy(scale, x_sloppy, x); // Scale and sum to accumulator
 
+        if (param.do_block_schwarz()) {
+          // Disable domain decomposition
+          for (auto i = 0u; i < Ar.size(); i++) {
+            (Ar[i]).DD(DD::reset);
+            (r_sloppy[i]).DD(DD::reset);
+	  }
+        }
+
         commGlobalReductionPop(); // renable global reductions for outer solver
       }
 
diff --git a/lib/laplace.cu b/lib/laplace.cu
index 5cee381311..cd0c6c43cd 100644
--- a/lib/laplace.cu
+++ b/lib/laplace.cu
@@ -1,176 +1,10 @@
-#include <dslash.h>
-#include <worker.h>
-#include <dslash_helper.cuh>
-#include <color_spinor_field_order.h>
-#include <gauge_field_order.h>
-#include <color_spinor.h>
-#include <dslash_helper.cuh>
-#include <index_helper.cuh>
-#include <gauge_field.h>
-#include <uint_to_char.h>
-
-#include <dslash_policy.hpp>
-#include <kernels/laplace.cuh>
-
-/**
-   This is the laplacian derivative based on the basic gauged differential operator
-*/
+#define SIGNATURE_ONLY
+#include <laplace.hpp>
+#undef SIGNATURE_ONLY
 
 namespace quda
 {
 
-  template <typename Arg> class Laplace : public Dslash<laplace, Arg>
-  {
-    using Dslash = Dslash<laplace, Arg>;
-    using Dslash::arg;
-    using Dslash::halo;
-    using Dslash::in;
-
-  public:
-    Laplace(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-            const ColorSpinorField &halo) :
-      Dslash(arg, out, in, halo)
-    {
-    }
-
-    void apply(const qudaStream_t &stream) override
-    {
-      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
-      Dslash::setParam(tp);
-
-      // operator is Hermitian so do not instantiate dagger
-      if (arg.nParity == 1) {
-        if (arg.xpay)
-          Dslash::template instantiate<packStaggeredShmem, 1, false, true>(tp, stream);
-        else
-          Dslash::template instantiate<packStaggeredShmem, 1, false, false>(tp, stream);
-      } else if (arg.nParity == 2) {
-        if (arg.xpay)
-          Dslash::template instantiate<packStaggeredShmem, 2, false, true>(tp, stream);
-        else
-          Dslash::template instantiate<packStaggeredShmem, 2, false, false>(tp, stream);
-      }
-    }
-
-    long long flops() const override
-    {
-      int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
-      int ghost_flops = (in.Nspin() * mv_flops + 2 * in.Ncolor() * in.Nspin());
-      int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
-      int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3);      // 3D or 4D operator
-
-      long long flops_ = 0;
-
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T:
-        flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * 2 * halo.GhostFace()[arg.kernel_type];
-        break;
-      case EXTERIOR_KERNEL_ALL: {
-        long long ghost_sites
-          = 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * ghost_sites;
-        break;
-      }
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY: {
-        long long sites = halo.Volume();
-        flops_ = (num_dir * in.Nspin() * mv_flops +                  // SU(3) matrix-vector multiplies
-                  ((num_dir - 1) * 2 * in.Ncolor() * in.Nspin()))
-          * sites; // accumulation
-        if (arg.xpay) flops_ += xpay_flops * sites;
-
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for flops done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        flops_ -= ghost_flops * ghost_sites;
-
-        break;
-      }
-      }
-
-      return flops_;
-    }
-
-    virtual long long bytes() const override
-    {
-      int gauge_bytes = arg.reconstruct * in.Precision();
-      int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed<typename Arg::Float>::value ? sizeof(float) : 0);
-      int proj_spinor_bytes = in.Nspin() == 4 ? spinor_bytes / 2 : spinor_bytes;
-      int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + 2 * spinor_bytes; // 2 since we have to load the partial
-      int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3);                           // 3D or 4D operator
-
-      long long bytes_ = 0;
-
-      switch (arg.kernel_type) {
-      case EXTERIOR_KERNEL_X:
-      case EXTERIOR_KERNEL_Y:
-      case EXTERIOR_KERNEL_Z:
-      case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * halo.GhostFace()[arg.kernel_type]; break;
-      case EXTERIOR_KERNEL_ALL: {
-        long long ghost_sites
-          = 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
-        bytes_ = ghost_bytes * ghost_sites;
-        break;
-      }
-      case INTERIOR_KERNEL:
-      case UBER_KERNEL:
-      case KERNEL_POLICY: {
-        long long sites = halo.Volume();
-        bytes_ = (num_dir * gauge_bytes + ((num_dir - 2) * spinor_bytes + 2 * proj_spinor_bytes) + spinor_bytes) * sites;
-        if (arg.xpay) bytes_ += spinor_bytes;
-	
-        if (arg.kernel_type == KERNEL_POLICY) break;
-        // now correct for bytes done by exterior kernel
-        long long ghost_sites = 0;
-        for (int d = 0; d < 4; d++)
-          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
-        bytes_ -= ghost_bytes * ghost_sites;
-	
-        break;
-      }
-      }
-      return bytes_;
-    }
-    
-    TuneKey tuneKey() const override
-    { // add laplace transverse dir to the key
-      auto key = Dslash::tuneKey();
-      strcat(key.aux, ",laplace=");
-      u32toa(key.aux + strlen(key.aux), arg.dir);
-      return key;
-    }
-  };
-
-  template <typename Float, int nColor, QudaReconstructType recon> struct LaplaceApply {
-
-    LaplaceApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
-                 cvector_ref<const ColorSpinorField> &x, const GaugeField &U, int dir, double a, double b, int parity,
-                 const int *comm_override, TimeProfile &profile)
-    {
-      constexpr int nDim = 4;
-      auto halo = ColorSpinorField::create_comms_batch(in, 1, false);
-      if (in.Nspin() == 1) {
-        constexpr int nSpin = 1;
-        LaplaceArg<Float, nSpin, nColor, nDim, recon> arg(out, in, halo, U, dir, a, b, x, parity, comm_override);
-        Laplace<decltype(arg)> laplace(arg, out, in, halo);
-        dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, in, halo, profile);
-      } else if (in.Nspin() == 4) {
-        constexpr int nSpin = 4;
-        LaplaceArg<Float, nSpin, nColor, nDim, recon> arg(out, in, halo, U, dir, a, b, x, parity, comm_override);
-        Laplace<decltype(arg)> laplace(arg, out, in, halo);
-        dslash::DslashPolicyTune<decltype(laplace)> policy(laplace, in, halo, profile);
-      } else {
-        errorQuda("Unsupported nSpin= %d", in.Nspin());
-      }
-    }
-  };
-
   // Apply the Laplace operator
   // out(x) = M*in = - a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu) + b*in(x)
   // Omits direction 'dir' from the operator.
@@ -179,7 +13,8 @@ namespace quda
                     const int *comm_override, TimeProfile &profile)
   {
     if constexpr (is_enabled<QUDA_LAPLACE_DSLASH>()) {
-      instantiate<LaplaceApply>(out, in, x, U, dir, a, b, parity, comm_override, profile);
+      auto dummy = DistanceType<false>();
+      instantiate<LaplaceApply>(out, in, x, U, dir, a, b, parity, comm_override, dummy, profile);
     } else {
       errorQuda("Laplace operator has not been enabled");
     }
diff --git a/lib/laplace.hpp b/lib/laplace.hpp
new file mode 100644
index 0000000000..ae8e7de559
--- /dev/null
+++ b/lib/laplace.hpp
@@ -0,0 +1,182 @@
+#include <dslash.h>
+#include <worker.h>
+#include <dslash_helper.cuh>
+#include <color_spinor_field_order.h>
+#include <gauge_field_order.h>
+#include <color_spinor.h>
+#include <dslash_helper.cuh>
+#include <index_helper.cuh>
+#include <gauge_field.h>
+#include <uint_to_char.h>
+
+#include <dslash_policy.hpp>
+#include <kernels/laplace.cuh>
+
+/**
+   This is the laplacian derivative based on the basic gauged differential operator
+*/
+
+namespace quda
+{
+
+  /** Dslash driver for the laplace kernel: launches it and reports its flop, byte and tune-key metadata. */
+  template <typename Arg> class Laplace : public Dslash<laplace, Arg>
+  {
+    using Dslash = Dslash<laplace, Arg>;
+    using Dslash::arg;
+    using Dslash::halo;
+    using Dslash::in;
+
+  public:
+    Laplace(Arg &arg, cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+            const ColorSpinorField &halo) :
+      Dslash(arg, out, in, halo)
+    {
+    }
+
+    /** Fetch the tuned launch parameters, then launch the variant matching arg.nParity and arg.xpay. */
+    void apply(const qudaStream_t &stream) override
+    {
+      TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
+      Dslash::setParam(tp);
+      // operator is Hermitian so do not instantiate dagger
+      if (arg.nParity == 1) {
+        if (arg.xpay)
+          Dslash::template instantiate<packStaggeredShmem, 1, false, true>(tp, stream);
+        else
+          Dslash::template instantiate<packStaggeredShmem, 1, false, false>(tp, stream);
+      } else if (arg.nParity == 2) {
+        if (arg.xpay)
+          Dslash::template instantiate<packStaggeredShmem, 2, false, true>(tp, stream);
+        else
+          Dslash::template instantiate<packStaggeredShmem, 2, false, false>(tp, stream);
+      }
+    }
+
+    /** Flop count for the kernel selected by arg.kernel_type. */
+    long long flops() const override
+    {
+      int mv_flops = (8 * in.Ncolor() - 2) * in.Ncolor(); // SU(3) matrix-vector flops
+      int ghost_flops = (in.Nspin() * mv_flops + 2 * in.Ncolor() * in.Nspin());
+      int xpay_flops = 2 * 2 * in.Ncolor() * in.Nspin(); // multiply and add per real component
+      int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3);      // 3D or 4D operator
+      long long flops_ = 0;
+
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T:
+        flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * 2 * halo.GhostFace()[arg.kernel_type];
+        break;
+      case EXTERIOR_KERNEL_ALL: {
+        long long ghost_sites
+          = 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        flops_ = (ghost_flops + (arg.xpay ? xpay_flops : xpay_flops / 2)) * ghost_sites;
+        break;
+      }
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY: {
+        long long sites = halo.Volume();
+        flops_ = (num_dir * in.Nspin() * mv_flops +                  // SU(3) matrix-vector multiplies
+                  ((num_dir - 1) * 2 * in.Ncolor() * in.Nspin()))
+          * sites; // accumulation
+        if (arg.xpay) flops_ += xpay_flops * sites;
+        if (arg.kernel_type == KERNEL_POLICY) break;
+        // now correct for flops done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        flops_ -= ghost_flops * ghost_sites;
+
+        break;
+      }
+      }
+
+      return flops_;
+    }
+
+    /** Byte count (memory traffic estimate) for the kernel selected by arg.kernel_type. */
+    long long bytes() const override
+    {
+      int gauge_bytes = arg.reconstruct * in.Precision();
+      int spinor_bytes = 2 * in.Ncolor() * in.Nspin() * in.Precision() + (isFixed<typename Arg::Float>::value ? sizeof(float) : 0);
+      int proj_spinor_bytes = in.Nspin() == 4 ? spinor_bytes / 2 : spinor_bytes;
+      int ghost_bytes = (proj_spinor_bytes + gauge_bytes) + 2 * spinor_bytes; // 2 since we have to load the partial
+      int num_dir = (arg.dir == 4 ? 2 * 4 : 2 * 3);                           // 3D or 4D operator
+      long long bytes_ = 0;
+
+      switch (arg.kernel_type) {
+      case EXTERIOR_KERNEL_X:
+      case EXTERIOR_KERNEL_Y:
+      case EXTERIOR_KERNEL_Z:
+      case EXTERIOR_KERNEL_T: bytes_ = ghost_bytes * 2 * halo.GhostFace()[arg.kernel_type]; break;
+      case EXTERIOR_KERNEL_ALL: {
+        long long ghost_sites
+          = 2 * (halo.GhostFace()[0] + halo.GhostFace()[1] + halo.GhostFace()[2] + halo.GhostFace()[3]);
+        bytes_ = ghost_bytes * ghost_sites;
+        break;
+      }
+      case INTERIOR_KERNEL:
+      case UBER_KERNEL:
+      case KERNEL_POLICY: {
+        long long sites = halo.Volume();
+        bytes_ = (num_dir * gauge_bytes + ((num_dir - 2) * spinor_bytes + 2 * proj_spinor_bytes) + spinor_bytes) * sites;
+        if (arg.xpay) bytes_ += spinor_bytes * sites; // per-site traffic, scaled by sites like the xpay term in flops()
+
+        if (arg.kernel_type == KERNEL_POLICY) break;
+        // now correct for bytes done by exterior kernel
+        long long ghost_sites = 0;
+        for (int d = 0; d < 4; d++)
+          if (arg.commDim[d]) ghost_sites += 2 * halo.GhostFace()[d];
+        bytes_ -= ghost_bytes * ghost_sites;
+
+        break;
+      }
+      }
+      return bytes_;
+    }
+
+    TuneKey tuneKey() const override
+    { // add laplace transverse dir to the key
+      auto key = Dslash::tuneKey();
+      strcat(key.aux, ",laplace=");
+      u32toa(key.aux + strlen(key.aux), arg.dir);
+      return key;
+    }
+  };
+
+  template <bool distance_pc> struct DistanceType { // empty tag: lets a constructor taking DistanceType<distance_pc> deduce distance_pc
+  };
+
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct LaplaceApply {
+    template <bool distance_pc>
+    LaplaceApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                 cvector_ref<const ColorSpinorField> &x, const GaugeField &U, int dir, double a, double b, int parity,
+                 const int *comm_override, DistanceType<distance_pc>, TimeProfile &profile)
+#ifdef SIGNATURE_ONLY // declaration only: the body is hidden from files that define SIGNATURE_ONLY
+      ;
+#else // body: create the comms halo, then dispatch on the spin count of the input field
+    {
+      auto halo = ColorSpinorField::create_comms_batch(in, 1, false);
+      constexpr int nDim = 4;
+      switch (in.Nspin()) {
+      case 1: {
+        LaplaceArg<Float, 1, nColor, nDim, DDArg, recon> laplace_arg(out, in, halo, U, dir, a, b, x, parity, comm_override);
+        Laplace<decltype(laplace_arg)> op(laplace_arg, out, in, halo);
+        dslash::DslashPolicyTune<decltype(op)> tuned(op, in, halo, profile);
+        break;
+      }
+      case 4: {
+        LaplaceArg<Float, 4, nColor, nDim, DDArg, recon> laplace_arg(out, in, halo, U, dir, a, b, x, parity, comm_override);
+        Laplace<decltype(laplace_arg)> op(laplace_arg, out, in, halo);
+        dslash::DslashPolicyTune<decltype(op)> tuned(op, in, halo, profile);
+        break;
+      }
+      default: errorQuda("Unsupported nSpin= %d", in.Nspin());
+      }
+    }
+#endif
+  };
+} // namespace quda
diff --git a/lib/laplace.in.cu b/lib/laplace.in.cu
new file mode 100644
index 0000000000..31518b4327
--- /dev/null
+++ b/lib/laplace.in.cu
@@ -0,0 +1,21 @@
+#include <laplace.hpp>
+
+namespace quda
+{
+  // Template arguments selected for this translation unit. NOTE(review): the @...@ tokens look like
+  // placeholders that the build system substitutes when generating the real source -- confirm.
+  using DDArg = @QUDA_DSLASH_DDARG@;
+  constexpr bool distance_pc = @QUDA_DSLASH_DISTANCE@;
+  constexpr int reconI = @QUDA_DSLASH_RECONI@;
+  constexpr int nColor = @QUDA_DSLASH_NCOLOR@;
+  constexpr QudaPrecision precision = QUDA_@QUDA_DSLASH_PREC@_PRECISION;
+  using Float = precision_type_mapper<precision>::type;
+
+  // Explicit instantiations: the apply struct, then its constructor for the configured distance_pc.
+  template struct LaplaceApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>;
+  template LaplaceApply<Float, nColor, DDArg, ReconstructWilson::recon[reconI]>::
+    LaplaceApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
+                 cvector_ref<const ColorSpinorField> &x, const GaugeField &U, int dir, double a, double b,
+                 int parity, const int *comm_override,
+                 DistanceType<distance_pc>, TimeProfile &profile);
+} // namespace quda
diff --git a/lib/staggered_quark_smearing.cu b/lib/staggered_quark_smearing.cu
index 89a10b9d1c..013af7034c 100644
--- a/lib/staggered_quark_smearing.cu
+++ b/lib/staggered_quark_smearing.cu
@@ -186,7 +186,7 @@ namespace quda
     }
   };
 
-  template <typename Float, int nColor, QudaReconstructType recon> struct StaggeredQSmearApply {
+  template <typename Float, int nColor, typename DDArg, QudaReconstructType recon> struct StaggeredQSmearApply {
     StaggeredQSmearApply(cvector_ref<ColorSpinorField> &out, cvector_ref<const ColorSpinorField> &in,
                          cvector_ref<const ColorSpinorField> &, const GaugeField &U, int t0, bool is_tslice_kernel,
                          int parity, int dir, bool dagger, const int *comm_override, TimeProfile &profile)
@@ -196,8 +196,8 @@ namespace quda
         constexpr int nSpin = 1;
 
         auto halo = ColorSpinorField::create_comms_batch(in, 3);
-        StaggeredQSmearArg<Float, nSpin, nColor, nDim, recon> arg(out, in, halo, U, t0, is_tslice_kernel, parity, dir,
-                                                                  dagger, comm_override);
+        StaggeredQSmearArg<Float, nSpin, nColor, nDim, DDArg, recon> arg(out, in, halo, U, t0, is_tslice_kernel, parity,
+                                                                         dir, dagger, comm_override);
         StaggeredQSmear<decltype(arg)> staggered_qsmear(arg, out, in, halo);
         dslash::DslashPolicyTune<decltype(staggered_qsmear)> policy(staggered_qsmear, in, halo, profile);
       } else {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index da3e7a97bf..0f0f14ee2f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -425,17 +425,18 @@ foreach(pol IN LISTS DSLASH_POLICIES)
       set_tests_properties(dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
-    add_test(NAME dslash_${DIRAC_NAME}_splitgrid_policy${pol2}
+    add_test(NAME dslash_${DIRAC_NAME}_splitgrid_policy${pol2}_DD
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
                      --dslash-type ${DIRAC_NAME}
                      --all-partitions 0
+                     --domain-decomposition 1
                      --test Dslash
-                     --dim 2 4 6 8
-                     --gtest_output=xml:dslash_${DIRAC_NAME}_splitgrid_test_pol${pol2}.xml)
+                     --dim 4 2 6 8
+                     --gtest_output=xml:dslash_${DIRAC_NAME}_splitgrid_test_pol${pol2}_DD.xml)
     if(polenv)
-      set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+      set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2}_DD PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
-    set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE})
+    set_property(TEST dslash_${DIRAC_NAME}_splitgrid_policy${pol2}_DD APPEND PROPERTY ENVIRONMENT QUDA_TEST_GRID_PARTITION=$ENV{QUDA_TEST_GRID_SIZE}) # APPEND: keep the policy variable set above instead of replacing it
 
     add_test(NAME benchmark_dslash_${DIRAC_NAME}_policy${pol2}
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
@@ -449,7 +450,20 @@ foreach(pol IN LISTS DSLASH_POLICIES)
     if(polenv)
       set_tests_properties(benchmark_dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
-endif()
+    
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD) # Mat test run with --domain-decomposition 1
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --all-partitions 0
+                     --test Mat
+                     --domain-decomposition 1
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
+  endif()
 
   if(QUDA_DIRAC_CLOVER)
     set(DIRAC_NAME clover)
@@ -467,7 +481,6 @@ endif()
     endif()
 
     # asymmetric preconditioning
-    set(DIRAC_NAME clover)
     add_test(NAME dslash_${DIRAC_NAME}_asym_policy${pol2}
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
                      --dslash-type ${DIRAC_NAME}
@@ -491,6 +504,17 @@ endif()
       set_tests_properties(benchmark_dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD) # Mat test run with --domain-decomposition 1
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --test Mat
+                     --domain-decomposition 1
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
   endif()
 
   if(QUDA_DIRAC_CLOVER_HASENBUSCH)
@@ -532,6 +556,17 @@ endif()
       set_tests_properties(benchmark_dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD) # Mat test run with --domain-decomposition 1
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --test Mat
+                     --domain-decomposition 1
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
   endif()
 
   if(QUDA_DIRAC_TWISTED_MASS)
@@ -573,6 +608,17 @@ endif()
       set_tests_properties(benchmark_dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD)
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --test Mat
+		     --domain-decomposition 1
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
   endif()
 
   if(QUDA_DIRAC_NDEG_TWISTED_MASS)
@@ -619,6 +665,19 @@ endif()
     if(polenv)
       set_tests_properties(benchmark_dslash_ndeg_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
+
+    set(TEST_NAME dslash_ndeg_${DIRAC_NAME}_mat_policy${pol2}_DD)
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --test Mat
+		     --domain-decomposition 1
+                     --flavor nondeg-doublet
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
   endif()
 
   if(QUDA_DIRAC_TWISTED_CLOVER)
@@ -662,6 +721,17 @@ endif()
       set_tests_properties(benchmark_dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD)
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --test Mat
+		     --domain-decomposition 1
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
   endif()
 
   if(QUDA_DIRAC_NDEG_TWISTED_CLOVER)
@@ -710,6 +780,18 @@ endif()
       set_tests_properties(benchmark_dslash_ndeg_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    set(TEST_NAME dslash_ndeg_${DIRAC_NAME}_mat_policy${pol2}_DD)
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --test Mat
+		     --domain-decomposition 1
+                     --flavor nondeg-doublet
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
   endif()
 
   if(QUDA_DIRAC_DOMAIN_WALL)
@@ -741,6 +823,19 @@ endif()
       set_tests_properties(benchmark_dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD)
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --all-partitions 0
+                     --test Mat
+                     --domain-decomposition 1
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
+
     # 4-d preconditioned
     set(DIRAC_NAME domain-wall-4d)
 
@@ -783,6 +878,19 @@ endif()
       set_tests_properties(benchmark_dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD)
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --all-partitions 0
+                     --test Mat
+		     --domain-decomposition 1
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
+
     # Mobius fermions
     set(DIRAC_NAME mobius)
 
@@ -868,6 +976,19 @@ endif()
     if(polenv)
       set_tests_properties(dslash_${DIRAC_NAME}_asym_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
+
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD)
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --all-partitions 0
+                     --test Mat
+		     --domain-decomposition 1
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
   endif()
 
   if(QUDA_DIRAC_STAGGERED)
@@ -876,7 +997,7 @@ endif()
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
                      --dslash-type ${DIRAC_NAME}
                      --test MatPC
-                     --dim 2 4 6 8
+                     --dim 4 2 6 8
                      --gtest_output=xml:dslash_${DIRAC_NAME}_matpc_test_pol${pol2}.xml)
     if(polenv)
       set_tests_properties(dslash_${DIRAC_NAME}_matpc_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
@@ -886,7 +1007,7 @@ endif()
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
                      --dslash-type ${DIRAC_NAME}
                      --test Mat
-                     --dim 2 4 6 8
+                     --dim 4 2 6 8
                      --gtest_output=xml:dslash_${DIRAC_NAME}_mat_test_pol${pol2}.xml)
     if(polenv)
       set_tests_properties(dslash_${DIRAC_NAME}_mat_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
@@ -896,7 +1017,7 @@ endif()
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
                      --dslash-type ${DIRAC_NAME}
                      --test MatPC
-                     --dim 2 4 6 8
+                     --dim 4 2 6 8
                      --gtest_output=xml:dslash_${DIRAC_NAME}_matpc_test_pol${pol2}.xml)
     if(polenv)
       set_tests_properties(dslash_${DIRAC_NAME}_splitgrid_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
@@ -915,6 +1036,19 @@ endif()
       set_tests_properties(benchmark_dslash_${DIRAC_NAME}_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
     endif()
 
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD)
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --all-partitions 0
+                     --test Mat
+		     --domain-decomposition 1
+                     --dim 4 2 6 8
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
+
     set(DIRAC_NAME asqtad)
     add_test(NAME dslash_${DIRAC_NAME}_matpc_policy${pol2}
              COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
@@ -965,13 +1099,26 @@ endif()
       set_tests_properties(dslash_${DIRAC_NAME}_build_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
     endif()
 
+    set(TEST_NAME dslash_${DIRAC_NAME}_mat_policy${pol2}_DD)
+    add_test(NAME ${TEST_NAME}
+             COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
+                     --dslash-type ${DIRAC_NAME}
+                     --all-partitions 0
+                     --test Mat
+		     --domain-decomposition 1
+                     --dim 8 6 10 12
+                     --gtest_output=xml:${TEST_NAME}.xml)
+    if(polenv)
+      set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol})
+    endif()
+
     if(QUDA_DIRAC_LAPLACE)
       set(DIRAC_NAME laplace)
       add_test(NAME dslash_${DIRAC_NAME}_mat_policy${pol2}
                COMMAND ${QUDA_CTEST_LAUNCH} $<TARGET_FILE:staggered_dslash_ctest> ${MPIEXEC_POSTFLAGS}
                        --dslash-type ${DIRAC_NAME}
                        --test Mat
-                       --dim 2 4 6 8
+                       --dim 4 2 6 8
                        --gtest_output=xml:dslash_${DIRAC_NAME}_mat_test_pol${pol2}.xml)
       if(polenv)
         set_tests_properties(dslash_${DIRAC_NAME}_mat_policy${pol2} PROPERTIES ENVIRONMENT QUDA_ENABLE_DSLASH_POLICY=${pol2})
diff --git a/tests/dslash_ctest.cpp b/tests/dslash_ctest.cpp
index 631d640203..21141f93a2 100644
--- a/tests/dslash_ctest.cpp
+++ b/tests/dslash_ctest.cpp
@@ -7,6 +7,7 @@ int argc_copy;
 char **argv_copy;
 dslash_test_type dtest_type = dslash_test_type::Dslash;
 bool ctest_all_partitions = false;
+bool ctest_domain_decomposition = false;
 
 // For googletest names must be non-empty, unique, and may only contain ASCII
 // alphanumeric characters or underscore
@@ -17,10 +18,10 @@ using ::testing::Range;
 using ::testing::TestWithParam;
 using ::testing::Values;
 
-class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, int>>
+class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, int, int, int>>
 {
 protected:
-  ::testing::tuple<int, int, int> param;
+  ::testing::tuple<int, int, int, int, int> param;
 
   bool skip()
   {
@@ -41,6 +42,9 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
                                                   true, false, false, false, true, false, true, true};
     if (!ctest_all_partitions && !partition_enabled[::testing::get<2>(GetParam())]) return true;
 
+    if (::testing::get<3>(GetParam()) == 0 && ::testing::get<4>(GetParam()) > 0) return true;
+    if (!ctest_domain_decomposition && ::testing::get<3>(GetParam())>0) return true;
+
     return false;
   }
 
@@ -61,6 +65,12 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
       printfQuda("Testing with split grid: %d  %d  %d  %d\n", grid_partition[0], grid_partition[1], grid_partition[2],
                  grid_partition[3]);
     }
+
+    if (dslash_test_wrapper.test_domain_decomposition) {
+      if (dd_red_black)
+        printfQuda("Testing DD Red Black with block: %d  %d  %d  %d\n", dd_block_size[0], dd_block_size[1],
+                   dd_block_size[2], dd_block_size[3]);
+    }
   }
 
 public:
@@ -77,7 +87,9 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
     }
     updateR();
 
-    dslash_test_wrapper.init_ctest(argc_copy, argv_copy, prec, recon);
+    int dd_value = ::testing::get<3>(GetParam());
+    int dd_color = ::testing::get<4>(GetParam());
+    dslash_test_wrapper.init_ctest(argc_copy, argv_copy, prec, recon, dd_value, dd_color);
     display_test_info(prec, recon);
   }
 
@@ -90,7 +102,6 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
 
   static void SetUpTestCase()
   {
-    initQuda(device_ordinal);
     DslashTestWrapper::dtest_type = dtest_type;
   }
 
@@ -100,13 +111,12 @@ class DslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, in
   static void TearDownTestCase()
   {
     DslashTestWrapper::destroy();
-    endQuda();
   }
 };
 
 TEST_P(DslashTest, verify)
 {
-  dslash_test_wrapper.dslashRef();
+  if (not dslash_test_wrapper.test_domain_decomposition) dslash_test_wrapper.dslashRef();
   dslash_test_wrapper.run_test(2);
 
   double deviation = dslash_test_wrapper.verify();
@@ -131,6 +141,7 @@ int main(int argc, char **argv)
   auto app = make_app();
   app->add_option("--test", dtest_type, "Test method")->transform(CLI::CheckedTransformer(dtest_type_map));
   app->add_option("--all-partitions", ctest_all_partitions, "Test all instead of reduced combination of partitions");
+  app->add_option("--domain-decomposition", ctest_domain_decomposition, "Test domain decomposition");
   add_comms_option_group(app);
   try {
     app->parse(argc, argv);
@@ -139,6 +150,7 @@ int main(int argc, char **argv)
   }
 
   initComms(argc, argv, gridsize_from_cmdline);
+  initQuda(device_ordinal);
 
   // The 'SetUp()' method of the Google Test class from which DslashTest
   // in derived has no arguments, but QUDA's implementation requires the
@@ -153,34 +165,63 @@ int main(int argc, char **argv)
 
   int test_rc = RUN_ALL_TESTS();
 
+  endQuda();
   finalizeComms();
   return test_rc;
 }
 
-std::string getdslashtestname(testing::TestParamInfo<::testing::tuple<int, int, int>> param)
+std::string getdslashtestname(testing::TestParamInfo<::testing::tuple<int, int, int, int, int>> param)
 {
   const int prec = ::testing::get<0>(param.param);
   const int recon = ::testing::get<1>(param.param);
   const int part = ::testing::get<2>(param.param);
+  const int dd = ::testing::get<3>(param.param);
+  const int col = ::testing::get<4>(param.param);
   std::stringstream ss;
-  // std::cout << "getdslashtestname" << get_dslash_str(dslash_type) << "_" << prec_str[prec] << "_r" << recon <<
-  // "_partition" << part << std::endl; ss << get_dslash_str(dslash_type) << "_";
   ss << get_prec_str(getPrecision(prec));
   ss << "_r" << recon;
   ss << "_partition" << part;
+  if (dd > 0) {
+    switch (dd) {
+    case 1: ss << "_dd_local"; break;
+    case 2: ss << "_dd_global"; break;
+    }
+    switch (col) {
+    case 0: ss << "_red_red"; break;
+    case 1: ss << "_black_red"; break;
+    case 2: ss << "_red_black"; break;
+    case 3: ss << "_black_black"; break;
+    }
+  } else if (col > 0) {
+    ss << "_skipped" << col;
+  }
   return ss.str();
 }
 
 #ifdef MULTI_GPU
-INSTANTIATE_TEST_SUITE_P(QUDA, DslashTest,
+#define N_PARTITIONS 16
+#else
+#define N_PARTITIONS 1
+#endif
+
+// regular tests
+INSTANTIATE_TEST_SUITE_P(Regular, DslashTest,
                          Combine(Range(0, 4),
                                  ::testing::Values(QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8),
-                                 Range(0, 16)),
+                                 Range(0, N_PARTITIONS), ::testing::Values(0), ::testing::Values(0)),
                          getdslashtestname);
-#else
-INSTANTIATE_TEST_SUITE_P(QUDA, DslashTest,
+
+#if QUDA_DOMAIN_DECOMPOSITION > 0
+#define N_DD_TESTS 3
+
+// DD tests
+INSTANTIATE_TEST_SUITE_P(DD, DslashTest,
                          Combine(Range(0, 4),
                                  ::testing::Values(QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8),
-                                 ::testing::Values(0)),
+                                 Range(0, N_PARTITIONS), Range(1, N_DD_TESTS), Range(0, 4)),
                          getdslashtestname);
+
 #endif
+
+#undef N_PARTITIONS
+#undef N_DD_TESTS
diff --git a/tests/dslash_test.cpp b/tests/dslash_test.cpp
index 11f8265e7f..afc1462764 100644
--- a/tests/dslash_test.cpp
+++ b/tests/dslash_test.cpp
@@ -30,6 +30,12 @@ class DslashTest : public ::testing::Test
       printfQuda("Testing with split grid: %d  %d  %d  %d\n", grid_partition[0], grid_partition[1], grid_partition[2],
                  grid_partition[3]);
     }
+
+    if (dslash_test_wrapper.test_domain_decomposition) {
+      if (dd_red_black)
+        printfQuda("Testing DD Red Black with block: %d  %d  %d  %d\n", dd_block_size[0], dd_block_size[1],
+                   dd_block_size[2], dd_block_size[3]);
+    }
   }
 
 public:
@@ -63,7 +69,7 @@ TEST_F(DslashTest, verify)
 {
   if (!verify_results) GTEST_SKIP();
 
-  dslash_test_wrapper.dslashRef();
+  if (not dslash_test_wrapper.test_domain_decomposition) dslash_test_wrapper.dslashRef();
   dslash_test_wrapper.run_test(2);
 
   double deviation = dslash_test_wrapper.verify();
@@ -89,6 +95,7 @@ int main(int argc, char **argv)
   auto app = make_app();
   app->add_option("--test", dtest_type, "Test method")->transform(CLI::CheckedTransformer(dtest_type_map));
   add_eofa_option_group(app);
+  add_dd_option_group(app);
   add_comms_option_group(app);
 
   try {
diff --git a/tests/dslash_test_utils.h b/tests/dslash_test_utils.h
index 5512c3d0a2..d8269cad08 100644
--- a/tests/dslash_test_utils.h
+++ b/tests/dslash_test_utils.h
@@ -63,6 +63,7 @@ struct DslashTestWrapper {
   // CUDA color spinor fields
   std::vector<ColorSpinorField> cudaSpinor;
   std::vector<ColorSpinorField> cudaSpinorOut;
+  std::vector<ColorSpinorField> cudaSpinorTmp;
 
   // Dirac pointers
   quda::Dirac *dirac = nullptr;
@@ -83,11 +84,14 @@ struct DslashTestWrapper {
   QudaParity parity = QUDA_EVEN_PARITY;
   static inline dslash_test_type dtest_type = dslash_test_type::Dslash;
   static inline bool test_split_grid = false;
+  static inline bool test_domain_decomposition = false;
   int num_src = 1;
 
+  static inline int dd_col = 0;
+
   const bool transfer = false;
 
-  void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon)
+  void init_ctest(int argc, char **argv, int precision, QudaReconstructType link_recon, int dd_value, int dd_color)
   {
     if (first_time) {
       gauge_param = newQudaGaugeParam();
@@ -115,6 +119,8 @@ struct DslashTestWrapper {
     inv_param.clover_cuda_prec_precondition = cuda_prec;
     inv_param.clover_cuda_prec_refinement_sloppy = cuda_prec;
 
+    init_domain_decomposition(dd_value, dd_color);
+
     init();
   }
 
@@ -146,6 +152,8 @@ struct DslashTestWrapper {
 
     if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor CPU precisions must match");
 
+    test_domain_decomposition = dd_red_black;
+
     for (int i = 0; i < 4; i++) inv_param.split_grid[i] = grid_partition[i];
     num_src = grid_partition[0] * grid_partition[1] * grid_partition[2] * grid_partition[3];
     test_split_grid = num_src > 1;
@@ -275,6 +283,36 @@ struct DslashTestWrapper {
     inv_param.verbosity = verbosity;
   }
 
+  void init_domain_decomposition(int value, int color)
+  {
+    if (value == 0) {
+      test_domain_decomposition = false;
+      return;
+    }
+    test_domain_decomposition = true;
+    dd_col = color;
+
+    if (value < 3) {
+      dd_red_black = true;
+
+      // dd_block_size is half of the local lattice
+      if (value == 1) {
+        for (auto i = 0u; i < 4; i++) dd_block_size[i] = gauge_param.X[i] / 2;
+        return;
+      }
+
+      // dd_block_size is half of the global lattice
+      if (value == 2) {
+        for (auto i = 0u; i < 4; i++) dd_block_size[i] = (gauge_param.X[i] * comm_dim(i)) / 2;
+        return;
+      }
+
+    } else {
+      dd_red_black = false;
+    }
+    errorQuda("Unexpected value for domain decomposition (%d)", value);
+  }
+
   void init()
   {
     printfQuda("Sending gauge field to GPU\n");
@@ -297,11 +335,10 @@ struct DslashTestWrapper {
       csParam.setPrecision(inv_param.cuda_prec, inv_param.cuda_prec, true);
 
       printfQuda("Creating cudaSpinor with nParity = %d\n", csParam.siteSubset);
-      cudaSpinor.resize(Nsrc);
-      for (int i = 0; i < Nsrc; i++) cudaSpinor[i] = ColorSpinorField(csParam);
+      resize(cudaSpinor, Nsrc, csParam);
       printfQuda("Creating cudaSpinorOut with nParity = %d\n", csParam.siteSubset);
-      cudaSpinorOut.resize(Nsrc);
-      for (int i = 0; i < Nsrc; i++) cudaSpinorOut[i] = ColorSpinorField(csParam);
+      resize(cudaSpinorOut, Nsrc, csParam);
+      if (test_domain_decomposition) { resize(cudaSpinorTmp, Nsrc, csParam); }
 
       printfQuda("Sending spinor field to GPU\n");
       cudaSpinor = spinor;
@@ -340,12 +377,21 @@ struct DslashTestWrapper {
   static void destroy()
   {
     for (int dir = 0; dir < 4; dir++)
-      if (hostGauge[dir]) host_free(hostGauge[dir]);
+      if (hostGauge[dir]) {
+        host_free(hostGauge[dir]);
+        hostGauge[dir] = nullptr;
+      }
 
     if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH
         || dslash_type == QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH) {
-      if (hostClover) host_free(hostClover);
-      if (hostCloverInv) host_free(hostCloverInv);
+      if (hostClover) {
+        host_free(hostClover);
+        hostClover = nullptr;
+      }
+      if (hostCloverInv) {
+        host_free(hostCloverInv);
+        hostCloverInv = nullptr;
+      }
     }
 
     spinor = {};
@@ -358,6 +404,7 @@ struct DslashTestWrapper {
       vp_spinorOut.clear();
       vp_spinorRef.clear();
     }
+    first_time = true;
   }
 
   void dslashRef()
@@ -805,6 +852,80 @@ struct DslashTestWrapper {
 
       dslashMultiSrcQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity);
 
+    } else if (test_domain_decomposition) {
+
+      vector_ref<ColorSpinorField> spinor = cudaSpinor;
+      vector_ref<ColorSpinorField> out = cudaSpinorOut;
+      vector_ref<ColorSpinorField> tmp = cudaSpinorTmp;
+
+      if (dd_red_black) {
+        for (int n = 0; n < Nsrc; n++) {
+          for (auto i = 0u; i < 4; i++) {
+            cudaSpinor[n].DD().block_dim[i] = dd_block_size[i];
+            cudaSpinorOut[n].DD().block_dim[i] = dd_block_size[i];
+            cudaSpinorTmp[n].DD().block_dim[i] = dd_block_size[i];
+          }
+        }
+
+        blas::zero(cudaSpinorOut);
+        blas::zero(cudaSpinorTmp);
+        // dd_col encodes (input colour, output colour) as (dd_col % 2, dd_col / 2), with 0 = red and 1 = black
+        spinor.DD(DD::reset, DD::red_black_type, dd_col % 2 == 0 ? DD::red_active : DD::black_active);
+        out.DD(DD::reset, DD::red_black_type, dd_col / 2 == 0 ? DD::red_active : DD::black_active);
+
+        for (int i = 0; i < niter; i++) {
+          host_timer.start();
+          switch (dtest_type) {
+          case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break;
+          case dslash_test_type::MatPC:
+          case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break;
+          case dslash_test_type::MatPCDagMatPC:
+          case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorOut, cudaSpinor); break;
+          default:
+            errorQuda("Test type %s not support for current Dslash", get_string(dtest_type_map, dtest_type).c_str());
+          }
+          host_timer.stop();
+
+          dslash_time.cpu_time += host_timer.last();
+          // skip first and last iterations since they may skew these metrics if comms are not synchronous
+          if (i > 0 && i < niter) {
+            dslash_time.cpu_min = std::min(dslash_time.cpu_min, host_timer.last());
+            dslash_time.cpu_max = std::max(dslash_time.cpu_max, host_timer.last());
+          }
+        }
+
+        spinor.DD(DD::reset);
+        out.DD(DD::reset);
+        spinorOut = cudaSpinorOut;
+
+        if (niter <= 2) { // HACK: verification calls run_test(2); longer (benchmarking) runs skip the reference solution
+          // We also test that Dyx is same as D applied to projected in and out spinors
+          blas::copy(tmp, cudaSpinor);
+          tmp.DD(DD::reset, DD::red_black_type, dd_col % 2 == 0 ? DD::red_active : DD::black_active);
+          tmp.projectDD();
+          tmp.DD(DD::reset);
+
+          blas::zero(cudaSpinorOut);
+          switch (dtest_type) {
+          case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinorTmp, parity); break;
+          case dslash_test_type::MatPC:
+          case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinorTmp); break;
+          case dslash_test_type::MatPCDagMatPC:
+          case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorOut, cudaSpinorTmp); break;
+          default:
+            errorQuda("Test type %s not support for current Dslash", get_string(dtest_type_map, dtest_type).c_str());
+          }
+
+          out.DD(DD::reset, DD::red_black_type, dd_col / 2 == 0 ? DD::red_active : DD::black_active);
+          out.projectDD();
+          out.DD(DD::reset);
+
+          spinorRef = cudaSpinorOut;
+        }
+      } else {
+        errorQuda("Test dd type not supported");
+      }
+
     } else {
 
       for (int i = 0; i < niter; i++) {
@@ -1020,7 +1141,7 @@ struct DslashTestWrapper {
 
   void run_test(int niter, bool = false)
   {
-    {
+    if (getTuning()) {
       printfQuda("Tuning...\n");
       dslashCUDA(1); // warm-up run
     }
@@ -1036,7 +1157,7 @@ struct DslashTestWrapper {
     unsigned long long bytes = (quda::Tunable::bytes_global() - bytes0);
 
     if (!test_split_grid) {
-      if (!transfer) spinorOut = cudaSpinorOut;
+      if (!transfer && !test_domain_decomposition) spinorOut = cudaSpinorOut;
 
       // print timing information
       printfQuda("%fus per kernel call\n", 1e6 * dslash_time.event_time / niter);
@@ -1085,6 +1206,13 @@ struct DslashTestWrapper {
         deviation
           = std::max(deviation, std::pow(10, -(double)(ColorSpinorField::Compare(spinorRef[0], vp_spinorOut[n]))));
       }
+    } else if (test_domain_decomposition) {
+      for (int n = 0; n < Nsrc; n++) {
+        auto dd_deviation = std::pow(10, -(double)(ColorSpinorField::Compare(spinorRef[n], spinorOut[n])));
+        printfQuda("Deviation for (D-PDP)_{%d,%d}*spinor is %e\n", dd_col % 2, dd_col / 2, dd_deviation);
+        // accumulate into the function-level deviation (not a shadowing local) so a DD mismatch reaches the caller
+        deviation = std::max(deviation, dd_deviation);
+      }
     } else {
       for (int n = 0; n < Nsrc; n++) {
         auto norm_cpu = blas::norm2(spinorRef[n]);
diff --git a/tests/staggered_dslash_ctest.cpp b/tests/staggered_dslash_ctest.cpp
index cc3f1de021..f38e3646e4 100644
--- a/tests/staggered_dslash_ctest.cpp
+++ b/tests/staggered_dslash_ctest.cpp
@@ -3,6 +3,7 @@
 using namespace quda;
 
 bool ctest_all_partitions = false;
+bool ctest_domain_decomposition = false;
 
 using ::testing::Bool;
 using ::testing::Combine;
@@ -10,10 +11,10 @@ using ::testing::Range;
 using ::testing::TestWithParam;
 using ::testing::Values;
 
-class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, int>>
+class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int, int, int, int, int>>
 {
 protected:
-  ::testing::tuple<int, int, int> param;
+  ::testing::tuple<int, int, int, int, int> param;
 
   bool skip()
   {
@@ -31,6 +32,10 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
     if (!ctest_all_partitions && !partition_enabled[::testing::get<2>(GetParam())]) return true;
 
     if (::testing::get<2>(GetParam()) > 0 && dslash_test_wrapper.test_split_grid) { return true; }
+
+    if (::testing::get<3>(GetParam()) == 0 && ::testing::get<4>(GetParam()) > 0) return true;
+    if (!ctest_domain_decomposition && ::testing::get<3>(GetParam()) > 0) return true;
+
     return false;
   }
 
@@ -42,6 +47,11 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
     printfQuda("prec recon   test_type     dagger   S_dim         T_dimension\n");
     printfQuda("%s   %s       %s           %d       %d/%d/%d        %d \n", get_prec_str(prec),
                get_recon_str(link_recon), get_string(dtest_type_map, dtest_type).c_str(), dagger, xdim, ydim, zdim, tdim);
+    if (dslash_test_wrapper.test_domain_decomposition) {
+      if (dd_red_black)
+        printfQuda("Testing DD Red Black with block: %d  %d  %d  %d\n", dd_block_size[0], dd_block_size[1],
+                   dd_block_size[2], dd_block_size[3]);
+    }
   }
 
 public:
@@ -58,7 +68,10 @@ class StaggeredDslashTest : public ::testing::TestWithParam<::testing::tuple<int
     }
     updateR();
 
-    dslash_test_wrapper.init_ctest(prec, recon);
+    int dd_value = ::testing::get<3>(GetParam());
+    int dd_color = ::testing::get<4>(GetParam());
+
+    dslash_test_wrapper.init_ctest(prec, recon, dd_value, dd_color);
     display_test_info(prec, recon);
   }
 
@@ -108,6 +121,7 @@ int main(int argc, char **argv)
   auto app = make_app();
   app->add_option("--test", dtest_type, "Test method")->transform(CLI::CheckedTransformer(dtest_type_map));
   app->add_option("--all-partitions", ctest_all_partitions, "Test all instead of reduced combination of partitions");
+  app->add_option("--domain-decomposition", ctest_domain_decomposition, "Test domain decomposition");
   add_comms_option_group(app);
   try {
     app->parse(argc, argv);
@@ -155,29 +169,55 @@ int main(int argc, char **argv)
   return test_rc;
 }
 
-std::string getstaggereddslashtestname(testing::TestParamInfo<::testing::tuple<int, int, int>> param)
+std::string getstaggereddslashtestname(testing::TestParamInfo<::testing::tuple<int, int, int, int, int>> param)
 {
   const int prec = ::testing::get<0>(param.param);
   const int recon = ::testing::get<1>(param.param);
   const int part = ::testing::get<2>(param.param);
+  const int dd = ::testing::get<3>(param.param);
+  const int col = ::testing::get<4>(param.param);
   std::stringstream ss;
   // ss << get_dslash_str(dslash_type) << "_";
   ss << get_prec_str(getPrecision(prec));
   ss << "_r" << recon;
   ss << "_partition" << part;
+  if (dd > 0) {
+    switch (dd) {
+    case 1: ss << "_dd_local"; break;
+    case 2: ss << "_dd_global"; break;
+    }
+    switch (col) {
+    case 0: ss << "_red_red"; break;
+    case 1: ss << "_black_red"; break;
+    case 2: ss << "_red_black"; break;
+    case 3: ss << "_black_black"; break;
+    }
+  } else if (col > 0) {
+    ss << "_skipped" << col;
+  }
   return ss.str();
 }
 
 #ifdef MULTI_GPU
-INSTANTIATE_TEST_SUITE_P(QUDA, StaggeredDslashTest,
-                         Combine(Range(0, 4),
-                                 ::testing::Values(QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8),
-                                 Range(0, 16)),
-                         getstaggereddslashtestname);
+#define N_PARTITIONS 16
+#else
+#define N_PARTITIONS 1
+#endif
+
+#if QUDA_DOMAIN_DECOMPOSITION > 0
+#define N_DD_TESTS 3
+#define N_DD_COLS 4
 #else
+#define N_DD_TESTS 1
+#define N_DD_COLS 1
+#endif
+
 INSTANTIATE_TEST_SUITE_P(QUDA, StaggeredDslashTest,
                          Combine(Range(0, 4),
                                  ::testing::Values(QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8),
-                                 ::testing::Values(0)),
+                                 Range(0, N_PARTITIONS), Range(0, N_DD_TESTS), Range(0, N_DD_COLS)),
                          getstaggereddslashtestname);
-#endif
+
+#undef N_PARTITIONS
+#undef N_DD_TESTS
+#undef N_DD_COLS
diff --git a/tests/staggered_dslash_test_utils.h b/tests/staggered_dslash_test_utils.h
index 1b5609bfda..d7f5024da0 100644
--- a/tests/staggered_dslash_test_utils.h
+++ b/tests/staggered_dslash_test_utils.h
@@ -58,6 +58,7 @@ struct StaggeredDslashTestWrapper {
 
   std::vector<ColorSpinorField> cudaSpinor;
   std::vector<ColorSpinorField> cudaSpinorOut;
+  std::vector<ColorSpinorField> cudaSpinorTmp;
 
   static inline std::vector<ColorSpinorField> vp_spinor;
   static inline std::vector<ColorSpinorField> vp_spinor_out;
@@ -76,8 +77,11 @@ struct StaggeredDslashTestWrapper {
 
   // Split grid options
   static inline bool test_split_grid = false;
+  static inline bool test_domain_decomposition = false;
   int num_src = 1;
 
+  static inline int dd_col = 0;
+
   void staggeredDslashRef()
   {
     // compare to dslash reference implementation
@@ -101,7 +105,7 @@ struct StaggeredDslashTestWrapper {
     }
   }
 
-  void init_ctest(int precision, QudaReconstructType link_recon_)
+  void init_ctest(int precision, QudaReconstructType link_recon_, int dd_value, int dd_color)
   {
     gauge_param = newQudaGaugeParam();
     inv_param = newQudaInvertParam();
@@ -124,6 +128,8 @@ struct StaggeredDslashTestWrapper {
       init_host();
       first_time = false;
     }
+
+    init_domain_decomposition(dd_value, dd_color);
     init();
   }
 
@@ -219,6 +225,36 @@ struct StaggeredDslashTestWrapper {
     setVerbosity(verbosity);
   }
 
+  /**
+     @brief Set up the domain-decomposition (DD) variant of the test
+     @param[in] value 0 disables the DD test; 1 and 2 select red-black
+     blocks of half the local / half the global lattice extent
+     @param[in] color Stored in dd_col
+   */
+  void init_domain_decomposition(int value, int color)
+  {
+    test_domain_decomposition = (value != 0);
+    if (!test_domain_decomposition) return;
+
+    dd_col = color;
+    dd_red_black = (value < 3);
+
+    switch (value) {
+    case 1:
+      // block extent: half of the local lattice
+      for (int d = 0; d < 4; d++) dd_block_size[d] = gauge_param.X[d] / 2;
+      return;
+    case 2:
+      // block extent: half of the global lattice (local extent times comm_dim)
+      for (int d = 0; d < 4; d++) dd_block_size[d] = (gauge_param.X[d] * comm_dim(d)) / 2;
+      return;
+    default: break;
+    }
+
+    // every other value falls through to here and is rejected
+    errorQuda("Unexpected value for domain decomposition (%d)", value);
+  }
+
   void init()
   {
 
@@ -241,7 +277,7 @@ struct StaggeredDslashTestWrapper {
     GaugeFieldParam cpuLongParam(gauge_param, qdp_longlink);
     cpuLongParam.order = QUDA_QDP_GAUGE_ORDER;
     cpuLongParam.ghostExchange = QUDA_GHOST_EXCHANGE_PAD;
-    cpuLong = GaugeField(cpuLongParam);
+    if (dslash_type == QUDA_ASQTAD_DSLASH) cpuLong = GaugeField(cpuLongParam);
 
     // Override link reconstruct as appropriate for staggered or asqtad
     if (is_staggered(dslash_type)) {
@@ -261,13 +297,10 @@ struct StaggeredDslashTestWrapper {
     csParam.setPrecision(inv_param.cuda_prec);
     csParam.location = QUDA_CUDA_FIELD_LOCATION;
 
-    cudaSpinor.resize(Nsrc);
-    cudaSpinorOut.resize(Nsrc);
-    for (auto i = 0; i < Nsrc; i++) {
-      cudaSpinor[i] = ColorSpinorField(csParam);
-      cudaSpinorOut[i] = ColorSpinorField(csParam);
-      cudaSpinor[i] = spinor[i];
-    }
+    resize(cudaSpinor, Nsrc, csParam);
+    resize(cudaSpinorOut, Nsrc, csParam);
+    if (test_domain_decomposition) { resize(cudaSpinorTmp, Nsrc, csParam); }
+    cudaSpinor = spinor;
 
     bool pc = (dtest_type == dslash_test_type::MatPC); // For test_type 0, can use either pc or not pc
     // because both call the same "Dslash" directly.
@@ -337,6 +370,65 @@ struct StaggeredDslashTestWrapper {
       }
       dslashMultiSrcQuda(_hp_x.data(), _hp_b.data(), &inv_param, parity);
 
+    } else if (test_domain_decomposition) {
+
+      vector_ref<ColorSpinorField> spinor = cudaSpinor;
+      vector_ref<ColorSpinorField> out = cudaSpinorOut;
+      vector_ref<ColorSpinorField> tmp = cudaSpinorTmp;
+
+      if (dd_red_black) {
+        for (int n = 0; n < Nsrc; n++) {
+          for (auto i = 0u; i < 4; i++) {
+            cudaSpinor[n].DD().block_dim[i] = dd_block_size[i];
+            cudaSpinorOut[n].DD().block_dim[i] = dd_block_size[i];
+            cudaSpinorTmp[n].DD().block_dim[i] = dd_block_size[i];
+          }
+        }
+
+        blas::zero(cudaSpinorOut);
+        blas::zero(cudaSpinorTmp);
+
+        spinor.DD(DD::reset, DD::red_black_type, dd_col % 2 == 0 ? DD::red_active : DD::black_active);
+        out.DD(DD::reset, DD::red_black_type, dd_col / 2 == 0 ? DD::red_active : DD::black_active);
+
+        switch (dtest_type) {
+        case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinor, parity); break;
+        case dslash_test_type::MatPC:
+        case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinor); break;
+        case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorOut, cudaSpinor); break;
+        default:
+          errorQuda("Test type %s not support for current Dslash", get_string(dtest_type_map, dtest_type).c_str());
+        }
+
+        spinor.DD(DD::reset);
+        out.DD(DD::reset);
+        spinorOut = cudaSpinorOut;
+
+        // We also test that Dyx is same as D applied to projected in and out spinors
+        blas::copy(tmp, cudaSpinor);
+        tmp.DD(DD::reset, DD::red_black_type, dd_col % 2 == 0 ? DD::red_active : DD::black_active);
+        tmp.projectDD();
+        tmp.DD(DD::reset);
+
+        switch (dtest_type) {
+        case dslash_test_type::Dslash: dirac->Dslash(cudaSpinorOut, cudaSpinorTmp, parity); break;
+        case dslash_test_type::MatPC:
+        case dslash_test_type::Mat: dirac->M(cudaSpinorOut, cudaSpinorTmp); break;
+        case dslash_test_type::MatDagMat: dirac->MdagM(cudaSpinorOut, cudaSpinorTmp); break;
+        default:
+          errorQuda("Test type %s not support for current Dslash", get_string(dtest_type_map, dtest_type).c_str());
+        }
+
+        out.DD(DD::reset, DD::red_black_type, dd_col / 2 == 0 ? DD::red_active : DD::black_active);
+        out.projectDD();
+        out.DD(DD::reset);
+
+        spinorRef = cudaSpinorOut;
+
+      } else {
+        errorQuda("Test dd type not supported");
+      }
+
     } else {
 
       for (int i = 0; i < niter; i++) {
@@ -445,6 +537,13 @@ struct StaggeredDslashTestWrapper {
         deviation = std::max(deviation, pow(10.0, -(double)(ColorSpinorField::Compare(spinorRef[0], vp_spinor_out[n]))));
         if (failed) { deviation = 1.0; }
       }
+    } else if (test_domain_decomposition) {
+      for (int n = 0; n < Nsrc; n++) {
+        auto deviation = std::pow(10, -(double)(ColorSpinorField::Compare(spinorRef[n], spinorOut[n])));
+        printfQuda("Deviation for (D-PDP)_{%d,%d}*spinor is %e\n", dd_col % 2, dd_col / 2, deviation);
+        double tol = getTolerance(cuda_prec);
+        EXPECT_LE(deviation, tol) << "Projected Dirac and project spinors do not agree";
+      }
     } else {
       for (int i = 0; i < Nsrc; i++) {
         auto spinor_ref_norm = blas::norm2(spinorRef[i]);
diff --git a/tests/staggered_eigensolve_test.cpp b/tests/staggered_eigensolve_test.cpp
index 95ebff044b..5d32989866 100644
--- a/tests/staggered_eigensolve_test.cpp
+++ b/tests/staggered_eigensolve_test.cpp
@@ -108,7 +108,7 @@ void init()
   cpuFatMILC = GaugeField(cpuParam);
 
   cpuParam.link_type = QUDA_ASQTAD_LONG_LINKS;
-  cpuParam.nFace = 3;
+  cpuParam.nFace = dslash_type == QUDA_ASQTAD_DSLASH ? 3 : 1;
   cpuParam.order = QUDA_QDP_GAUGE_ORDER;
   cpuLongQDP = GaugeField(cpuParam);
   cpuParam.order = QUDA_MILC_GAUGE_ORDER;
@@ -121,7 +121,7 @@ void init()
 
   // Reorder gauge fields to MILC order
   cpuFatMILC = cpuFatQDP;
-  cpuLongMILC = cpuLongQDP;
+  if (dslash_type == QUDA_ASQTAD_DSLASH) cpuLongMILC = cpuLongQDP;
 
   // Compute plaquette. Routine is aware that the gauge fields already have the phases on them.
   // This needs to be called before `loadFatLongGaugeQuda` because this routine also loads the
@@ -142,10 +142,11 @@ void init()
 
   // now copy back to QDP aliases, since these are used for the reference dslash
   cpuFatQDP = cpuFatMILC;
-  cpuLongQDP = cpuLongMILC;
-  // ensure QDP alias has exchanged ghosts
   cpuFatQDP.exchangeGhost();
-  cpuLongQDP.exchangeGhost();
+  if (dslash_type == QUDA_ASQTAD_DSLASH) {
+    cpuLongQDP = cpuLongMILC;
+    cpuLongQDP.exchangeGhost();
+  }
 
   // Staggered Gauge construct END
   //-----------------------------------------------------------------------------------
diff --git a/tests/staggered_invert_test.cpp b/tests/staggered_invert_test.cpp
index 00b0023e93..fe900fb1aa 100644
--- a/tests/staggered_invert_test.cpp
+++ b/tests/staggered_invert_test.cpp
@@ -203,7 +203,7 @@ void init()
   cpuFatMILC = GaugeField(cpuParam);
 
   cpuParam.link_type = QUDA_ASQTAD_LONG_LINKS;
-  cpuParam.nFace = 3;
+  cpuParam.nFace = dslash_type == QUDA_ASQTAD_DSLASH ? 3 : 1;
   cpuParam.order = QUDA_QDP_GAUGE_ORDER;
   cpuLongQDP = GaugeField(cpuParam);
   cpuParam.order = QUDA_MILC_GAUGE_ORDER;
@@ -216,7 +216,7 @@ void init()
 
   // Reorder gauge fields to MILC order
   cpuFatMILC = cpuFatQDP;
-  cpuLongMILC = cpuLongQDP;
+  if (dslash_type == QUDA_ASQTAD_DSLASH) cpuLongMILC = cpuLongQDP;
 
   // Compute plaquette. Routine is aware that the gauge fields already have the phases on them.
   // This needs to be called before `loadFatLongGaugeQuda` because this routine also loads the
@@ -237,10 +237,11 @@ void init()
 
   // now copy back to QDP aliases, since these are used for the reference dslash
   cpuFatQDP = cpuFatMILC;
-  cpuLongQDP = cpuLongMILC;
-  // ensure QDP alias has exchanged ghosts
   cpuFatQDP.exchangeGhost();
-  cpuLongQDP.exchangeGhost();
+  if (dslash_type == QUDA_ASQTAD_DSLASH) {
+    cpuLongQDP = cpuLongMILC;
+    cpuLongQDP.exchangeGhost();
+  }
 
   // Staggered Gauge construct END
   //-----------------------------------------------------------------------------------
diff --git a/tests/utils/command_line_params.cpp b/tests/utils/command_line_params.cpp
index 6b27d64876..86b8939bc8 100644
--- a/tests/utils/command_line_params.cpp
+++ b/tests/utils/command_line_params.cpp
@@ -78,6 +78,10 @@ QudaInverterType precon_type = QUDA_INVALID_INVERTER;
 QudaSchwarzType precon_schwarz_type = QUDA_INVALID_SCHWARZ;
 QudaAcceleratorType precon_accelerator_type = QUDA_INVALID_ACCELERATOR;
 
+std::array<int, 4> dd_block_size = {4, 4, 4, 4}; // set by --dd-block-size
+bool dd_red_black = false; // set by --dd-red-black
+bool dd_test_projection = false; // NOTE(review): presumably meant for --dd-test-projection; confirm the binding
+
 double madwf_diagonal_suppressor = 0.0;
 int madwf_ls = 4;
 int madwf_null_miniter = niter;
@@ -1115,7 +1119,7 @@ void add_multigrid_option_group(std::shared_ptr<QUDAApp> quda_app)
 
 void add_eofa_option_group(std::shared_ptr<QUDAApp> quda_app)
 {
-  auto opgroup = quda_app->add_option_group("EOFA", "Options controlling EOFA parameteres");
+  auto opgroup = quda_app->add_option_group("EOFA", "Options controlling EOFA parameters");
 
   CLI::TransformPairs<int> eofa_pm_map {{"plus", 1}, {"minus", 0}};
   opgroup->add_option("--eofa-pm", eofa_pm, "Set to evalute \"plus\" or \"minus\" EOFA operator (default plus)")
@@ -1126,6 +1130,17 @@ void add_eofa_option_group(std::shared_ptr<QUDAApp> quda_app)
   opgroup->add_option("--eofa-mq3", eofa_mq1, "Set mq3 for EOFA operator (default 1.0)");
 }
 
+// Register the domain decomposition (DD) command-line options; each option is bound to its own global variable
+void add_dd_option_group(std::shared_ptr<QUDAApp> quda_app)
+{
+  auto opgroup = quda_app->add_option_group("DD", "Options controlling Domain Decomposition parameters");
+  const char *block_size_help = "Set the domain decomposition block size in all four dimension (default 4 4 4 4)";
+  opgroup->add_option("--dd-block-size", dd_block_size, block_size_help)->expected(4); // one value per dimension
+  opgroup->add_option("--dd-red-black", dd_red_black, "Enable red-black domain decomposition (default false)");
+  opgroup->add_option("--dd-test-projection", dd_test_projection, // own flag, independent of --dd-red-black
+                      "Compare against the projected result (default false)");
+}
+
 void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
 {
   // Option group for SU(3) related options
@@ -1166,7 +1181,7 @@ void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app)
 
 void add_madwf_option_group(std::shared_ptr<QUDAApp> quda_app)
 {
-  auto opgroup = quda_app->add_option_group("MADWF", "Options controlling MADWF parameteres");
+  auto opgroup = quda_app->add_option_group("MADWF", "Options controlling MADWF parameters");
 
   opgroup->add_option("--madwf-diagonal-suppressor", madwf_diagonal_suppressor,
                       "Set the digonal suppressor for MADWF (default 0)");
@@ -1300,8 +1315,7 @@ void add_gaugefix_option_group(std::shared_ptr<QUDAApp> quda_app)
 
 void add_comms_option_group(std::shared_ptr<QUDAApp> quda_app)
 {
-  auto opgroup
-    = quda_app->add_option_group("Communication", "Options controlling communication (split grid) parameteres");
+  auto opgroup = quda_app->add_option_group("Communication", "Options controlling communication (split grid) parameters");
   opgroup->add_option("--grid-partition", grid_partition, "Set the grid partition (default 1 1 1 1)")->expected(4);
 }
 
diff --git a/tests/utils/command_line_params.h b/tests/utils/command_line_params.h
index 6df4521c2b..96bd047386 100644
--- a/tests/utils/command_line_params.h
+++ b/tests/utils/command_line_params.h
@@ -252,6 +252,7 @@ void add_eigen_option_group(std::shared_ptr<QUDAApp> quda_app);
 void add_deflation_option_group(std::shared_ptr<QUDAApp> quda_app);
 void add_multigrid_option_group(std::shared_ptr<QUDAApp> quda_app);
 void add_eofa_option_group(std::shared_ptr<QUDAApp> quda_app);
+void add_dd_option_group(std::shared_ptr<QUDAApp> quda_app);
 void add_madwf_option_group(std::shared_ptr<QUDAApp> quda_app);
 void add_su3_option_group(std::shared_ptr<QUDAApp> quda_app);
 void add_heatbath_option_group(std::shared_ptr<QUDAApp> quda_app);
@@ -338,6 +339,10 @@ extern QudaInverterType precon_type;
 extern QudaSchwarzType precon_schwarz_type;
 extern QudaAcceleratorType precon_accelerator_type;
 
+extern std::array<int, 4> dd_block_size;
+extern bool dd_red_black;
+extern bool dd_test_projection;
+
 extern double madwf_diagonal_suppressor;
 extern int madwf_ls;
 extern int madwf_null_miniter;