diff --git a/Makefile.in b/Makefile.in
index 8b54c70cd..7aaca2e0d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -58,7 +58,7 @@ MODULES = read_input gamma measure_gauge_action start \
 	little_D block operator \
 	spinor_fft X_psi P_M_eta \
 	jacobi fatal_error invert_clover_eo gettime \
-	tm_debug_printf \
+	tm_debug_printf compare_derivative \
         @SPI_FILES@ @QUDA_INTERFACE@ @DDalphaAMG_INTERFACE@
 
 CXXMODULES = @QPHIX_INTERFACE@
diff --git a/compare_derivative.c b/compare_derivative.c
new file mode 100644
index 000000000..ab30bc0b7
--- /dev/null
+++ b/compare_derivative.c
@@ -0,0 +1,79 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2024 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP 
+# include <omp.h>
+#endif
+#include <stdio.h>
+#include "global.h"
+#include "monomial/monomial.h"
+
+/* this function compares two derivatives calculated by an external library and tmLQCD */
+void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native, 
+    const double threshold, const char * name){
+  int n_diff = 0;
+
+  for(int ix = 0; ix < VOLUME; ix++){
+    for(int mu=0; mu<4; mu++){
+      double *ext=&(ext_lib[ix][mu].d1);
+      double *nat=&(native[ix][mu].d1);
+      for(int j=0; j<8; ++j){
+        double diff=ext[j]-nat[j];
+        if (sqrt(diff*diff) > threshold || isnan( ext[j] ) || isinf(ext[j]) ){
+            n_diff++;
+            printf("derivative at (t,x,y,z,mu,j) %d,%d,%d,%d,%d,%d,"
+                   " ext: %-14e, native: %-14e ratio: %-14g diff %-14g  on proc_id %d\n", 
+                  g_coord[ix][0], g_coord[ix][1], g_coord[ix][2], g_coord[ix][3], mu, j,
+                  ext[j], nat[j], ext[j]/nat[j], ext[j]-nat[j], g_proc_id);
+        }
+      }
+    }
+  }
+  if(n_diff > 0){
+    printf("%s: the deviation between tmLQCD and the external library "
+           "exceeds the threshold %.1e in %d case(s) for parameters: c0=%e c1=%e g_beta=%e on proc_id: %d\n",
+           name,
+           threshold,
+           n_diff,
+           mnl->c0,
+           mnl->c1,
+           mnl->beta,
+           g_proc_id);
+
+    if(g_strict_residual_check) fatal_error("Difference between external library and tmLQCD-native function!", 
+                                            name);
+  }
+
+  int red_n_diff = 0;
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Reduce(&n_diff, &red_n_diff, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
+#else
+  red_n_diff = n_diff;
+#endif
+  if(g_proc_id == 0){
+    printf("The maximum number of deviations in %s exceeding the threshold %.1e was %d\n",
+           name, threshold, red_n_diff);
+  }
+}
+
diff --git a/compare_derivative.h b/compare_derivative.h
new file mode 100644
index 000000000..82fd3b1a7
--- /dev/null
+++ b/compare_derivative.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2024 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef COMPARE_DERIVATIVE_H
+#define COMPARE_DERIVATIVE_H
+
+#include "monomial/monomial.h"
+#include "su3adj.h"
+
+void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native, const double threshold, const char * name);
+
+#endif
diff --git a/configure.in b/configure.in
index da1ea9046..5aaf15dfd 100644
--- a/configure.in
+++ b/configure.in
@@ -964,6 +964,16 @@ if test $enable_quda_experimental = yes; then
 else
   AC_MSG_RESULT(no)
 fi
+AC_MSG_CHECKING(whether the QUDA force is enabled)
+AC_ARG_ENABLE(quda_fermionic_forces,
+  AS_HELP_STRING([--enable-quda_fermionic_forces], [enable support for fermionic forces using QUDA [default=yes]]),
+  enable_quda_fermionic_forces=$enableval, enable_quda_fermionic_forces=yes)
+if test $enable_quda_fermionic_forces = no; then
+  AC_MSG_RESULT(no)
+else
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(TM_QUDA_FERMIONIC_FORCES,1, fermionic forces with QUDA are enabled)
+fi
 
 # QPhiX library for Intel Xeon and Xeon Phis
 AC_MSG_CHECKING(whether we want to use QPhiX)
diff --git a/doc/input.tex b/doc/input.tex
index 780e1afee..85610ed36 100644
--- a/doc/input.tex
+++ b/doc/input.tex
@@ -425,17 +425,10 @@ \subsection{Input parameter for main program}
 Each of them has different options
 :
 \begin{itemize}
-\item {\ttfamily DET, CLOVERDET}:
-  \begin{itemize}
-  \item {\ttfamily 2KappaMu}
-  \end{itemize}
-\item {\ttfamily CLOVERDET}:
-  \begin{itemize}
-  \item {\ttfamily csw}
-  \end{itemize}
 \item {\ttfamily DET, CLOVERDET}:
   \begin{itemize}
   \item {\ttfamily Kappa}
+  \item {\ttfamily 2KappaMu}
   \item {\ttfamily Timescale}: the timescale on which to integrate
     this monomial. Counting starts from zero up to the total number of
     timescales minus 1.
@@ -454,8 +447,17 @@ \subsection{Input parameter for main program}
   \item {\ttfamily HB\_Solver}: the solver to be used in the heatbath step, see section \ref{sec:hb.solver} for details.
   \item {\ttfamily Name}: a name to be assigned to the monomial. The
     default is {\ttfamily DET}
+  \item {\ttfamily UseExternalInverter} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
 %
+\item {\ttfamily CLOVERDET}:
+\begin{itemize}
+	\item {\ttfamily csw}
+	\item {\ttfamily UseExternalLibrary} 
+	Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
+\end{itemize}
+%
 \item {\ttfamily DETRATIO}: the same as for {\ttfamily DET}, but in
   addition:
   \begin{itemize}
@@ -464,10 +466,13 @@ \subsection{Input parameter for main program}
   \item {\ttfamily Name}: 
     a name to be assigned to the monomial. 
     The default is {\ttfamily DETRATIO}
+  \item {\ttfamily UseExternalInverter} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
 %
 \item {\ttfamily CLOVERDETRATIO}: 
   see {\ttfamily CLOVERDET} and {\ttfamily DETRATIO}.
+ 
 %
 \item {\ttfamily GAUGE}: 
   \begin{itemize}
@@ -490,6 +495,8 @@ \subsection{Input parameter for main program}
   \item {\ttfamily RectangleCoefficient}: the value of the parameter
     $c_1$. The coefficient $c_0$ is computed from $c_0 = 1-8c_1$. Is
     effective only for {\ttfamily type = user}.
+   \item {\ttfamily UseExternalLibrary} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
   There is maximally one instance allowed of this type.
   
diff --git a/doc/quda.tex b/doc/quda.tex
index 1eab42b36..281be3d1e 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -13,7 +13,9 @@ \subsubsection{Design goals of the interface}
 \begin{enumerate}
 	\item \emph{Safety.} Naturally, highest priority is given to the correctness of the output of the interface. 
 	This is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines.
-\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER, DBCLOVER} are supported. In the HMC, the same flag can be used to offload solves for the \texttt{DET, DETRATIO, CLOVERDET, CLOVERDETRATIO, RAT, RATCOR, NDRAT, NDRATCOR, NDCLOVERRAT} and \texttt{NDCLOVERRATCOR} monomials.
+\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER, DBCLOVER} are supported. 
+    Within the monomial declarations of the input file (between {\ttfamily BeginMonomial} and {\ttfamily EndMonomial}) the same flag can be used to offload solves for the \texttt{DET, DETRATIO, CLOVERDET, CLOVERDETRATIO, RAT, RATCOR, NDRAT, NDRATCOR, NDCLOVERRAT} and \texttt{NDCLOVERRATCOR} monomials in the HMC.
+    Further, the flag {\ttfamily UseExternalLibrary} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the force calculation for the given monomial with support currently limited to {\ttfamily GAUGE, CLOVERDET, CLOVERDETRATIO}.
 	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). The QUDA interface is entered .
 	\item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular:
 	\begin{itemize}
@@ -68,6 +70,22 @@ \subsubsection{Installation}
 \end{verbatim}
 Note that a {\ttfamily C++} compiler is required for linking against the QUDA library, therefore set {\ttfamily CXX} appropriately. {\ttfamily \${QUDADIR}} is where you installed QUDA in the previous step and {\ttfamily \${CUDADIR}} is required again for linking.
 
+\subsubsection{QUDA versions}
+
+If you need a version of QUDA after https://github.com/lattice/quda/commit/50864ffde1bd8f46fd4a2a2b2e6d44a5a588e2c2 you nee to configure with 
+\begin{verbatim}
+  --enable-quda_experimental=yes
+\end{verbatim}
+
+If you need a version of QUDA before \url{https://github.com/lattice/quda/commit/fd50676db06fc36efb3a791a3059c57cca70bb55} you need to add in the configuration script the option
+\begin{verbatim}
+  --enable-quda_fermionic_forces=no
+\end{verbatim}
+so that the wrapper to the QUDA fermionic forces is not compiled,
+thus if \texttt{--enable-quda_fermionic_forces=no} setting {\ttfamily UseExternalLibrary=yes} in the inputfile for the {\ttfamily  CLOVERDET, CLOVERDETRATIO} monomials
+is not supported and tmLQCD will stop with an error.
+
+
 \subsubsection{Usage}
 Any main program that reads and handles the operator declaration from an input file can easily be set up to use the QUDA inverter by setting the {\ttfamily UseExternalInverter} flag to {\ttfamily quda}. For example, in the input file for the {\ttfamily invert} executable, add the flag to the operator declaration as
 \begin{verbatim}
diff --git a/global.h b/global.h
index 09009b971..ab1c9c171 100644
--- a/global.h
+++ b/global.h
@@ -194,7 +194,7 @@ EXTERN su3_32 ** g_gauge_field_copy_32;
 
 EXTERN su3adj ** moment;
 EXTERN su3adj ** df0;
-EXTERN su3adj ** ddummy;
+EXTERN su3adj ** ddummy, ** debug_derivative;
 
 EXTERN int count00,count01,count10,count11,count20,count21;
 EXTERN double g_kappa, g_c_sw, g_beta;
diff --git a/include/tmlqcd_config_internal.h.in b/include/tmlqcd_config_internal.h.in
index a90b18c55..1a187466a 100644
--- a/include/tmlqcd_config_internal.h.in
+++ b/include/tmlqcd_config_internal.h.in
@@ -203,6 +203,9 @@
 /* Using experimental QUDA version */
 #undef TM_QUDA_EXPERIMENTAL
 
+/* Using QUDA fermionic forces */
+#undef TM_QUDA_FERMIONIC_FORCES
+
 /* Using DDalphaAMG */
 #undef DDalphaAMG
 
diff --git a/init/init_moment_field.c b/init/init_moment_field.c
index f0db1a156..e7a000e02 100644
--- a/init/init_moment_field.c
+++ b/init/init_moment_field.c
@@ -28,7 +28,7 @@
 #include "su3adj.h"
 #include "sse.h"
 
-su3adj * mo=NULL, *df=NULL, *du=NULL;
+su3adj * mo=NULL, *df=NULL, *du=NULL, *du_internal=NULL;
 
 int init_moment_field(const int V, const int VR) {
   int i = 0;
@@ -94,6 +94,27 @@ int init_moment_field(const int V, const int VR) {
     ddummy[i] = ddummy[i-1]+4;
   }
 
+  if(g_debug_level>3){
+    if((void*)(du_internal = (su3adj*)calloc(4*VR+1, sizeof(su3adj))) == NULL) {
+      printf ("malloc errno : %d\n",errno); 
+      errno = 0;
+      return(5);
+    }
+    if((void*)(debug_derivative = (su3adj**)calloc(VR,sizeof(su3adj*))) == NULL) {
+      printf ("malloc errno : %d\n",errno); 
+      errno = 0;
+      return(6);
+    }
+#if ( defined SSE || defined SSE2 || defined SSE3)
+    debug_derivative[0] = (su3adj*)(((unsigned long int)(du_internal)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+    debug_derivative[0] = du_internal;
+#endif
+    
+    for(i = 1; i < VR; i++){
+      debug_derivative[i] = debug_derivative[i-1]+4;
+    }
+  }
   return(0);
 }
 
diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index e9863b29e..b98a38e3c 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -24,6 +24,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
+#include <string.h>
 #include "global.h"
 #include "su3.h"
 #include "su3adj.h"
@@ -49,6 +50,11 @@
 #include "operator/clovertm_operators.h"
 #include "operator/clovertm_operators_32.h"
 #include "cloverdet_monomial.h"
+#include "xchange/xchange_deri.h"
+#include "compare_derivative.h"
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
 
 /* think about chronological solver ! */
 
@@ -103,54 +109,101 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
                       mnl->csg_N, &mnl->csg_n, N);
   
-  // Y_o -> w_fields[0]
-  tm_stopwatch_push(&g_timers, "Qm", "");
-  mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
-  if(mnl->even_odd_flag) {
-    // apply Hopping Matrix M_{eo}
-    // to get the even sites of X_e
-    tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-    H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu);
-    tm_stopwatch_pop(&g_timers, 0, 1, "");
-    // \delta Q sandwitched by Y_o^\dagger and X_e
-    deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
-    
-    // to get the even sites of Y_e
-    tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-    H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu);
+  
+
+  if ( mnl->external_library == QUDA_LIB){
+    if(!mnl->even_odd_flag) {
+      fatal_error("QUDA support only even_odd_flag",__func__);
+    }
+#ifdef TM_USE_QUDA
+    if (g_debug_level > 3) {
+#ifdef TM_USE_MPI
+      // FIXME: here we do not need to set to zero the interior but only the halo
+#ifdef TM_USE_OMP
+      # pragma omp parallel for
+#endif
+      for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
+        for(int mu=0;mu<4;mu++) { 
+          _zero_su3adj(debug_derivative[i][mu]);
+        }
+      }
+#endif // end setting to zero the halo when using MPI
+      // we copy only the interior
+      memcpy(debug_derivative[0], hf->derivative[0], 4*VOLUME*sizeof(su3adj));
+    }
+
+    compute_cloverdet_derivative_quda(mnl, hf, mnl->w_fields[1], NULL, 0);
+
+    if (g_debug_level > 3){
+      su3adj **given = hf->derivative;
+      hf->derivative = debug_derivative;
+      mnl->external_library = NO_EXT_LIB;
+      tm_debug_printf( 0, 3, "Recomputing the derivative from tmLQCD\n");
+      cloverdet_derivative(id, hf);
+      #ifdef TM_USE_MPI
+        xchange_deri(hf->derivative);// this function use ddummy inside
+      #endif
+      compare_derivative(mnl, given, hf->derivative, 1e-9, "cloverdet_derivative");
+      mnl->external_library = QUDA_LIB;
+      hf->derivative = given;
+    }
+#else
+      fatal_error("in %s  external_library == QUDA_LIB requires TM_USE_QUDA to be true",__func__);
+#endif // end ifdef TM_USE_QUDA
+  }
+  else{
+    // Y_o -> w_fields[0]
+    tm_stopwatch_push(&g_timers, "Qm", "");
+    mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
     tm_stopwatch_pop(&g_timers, 0, 1, "");
-    // \delta Q sandwitched by Y_e^\dagger and X_o
-    // uses the gauge field in hf and changes the derivative fields in hf
-    deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor);
+    // print_spinor(mnl->w_fields[0], 0 , 1);
+
+    if(mnl->even_odd_flag) {
+      // apply Hopping Matrix M_{eo}
+      // to get the even sites of X_e
+      tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
+      H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu);
+      tm_stopwatch_pop(&g_timers, 0, 1, "");
+      // \delta Q sandwitched by Y_o^\dagger and X_e
+      deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
+      
+      // to get the even sites of Y_e
+      tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
+      H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu);
+      tm_stopwatch_pop(&g_timers, 0, 1, "");
+      // \delta Q sandwitched by Y_e^\dagger and X_o
+      // uses the gauge field in hf and changes the derivative fields in hf
+      deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor);
+      
+      // here comes the clover term...
+      // computes the insertion matrices for S_eff
+      // result is written to swp and swm
+      // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e
+      sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+      
+      // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
+      sw_spinor_eo(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
     
-    // here comes the clover term...
-    // computes the insertion matrices for S_eff
-    // result is written to swp and swm
-    // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e
-    sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+      // compute the contribution for the det-part
+      // we again compute only the insertion matrices for S_det
+      // the result is added to swp and swm
+      // even sites only!
+      sw_deriv(EE, mnl->mu);
+    }
+    else {
+      /* \delta Q sandwitched by Y^\dagger and X */
+      deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor);
+
+      sw_spinor(mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+    }
     
-    // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-    sw_spinor_eo(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
-  
-    // compute the contribution for the det-part
-    // we again compute only the insertion matrices for S_det
-    // the result is added to swp and swm
-    // even sites only!
-    sw_deriv(EE, mnl->mu);
-  }
-  else {
-    /* \delta Q sandwitched by Y^\dagger and X */
-    deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor);
+    // now we compute
+    // finally, using the insertion matrices stored in swm and swp
+    // we compute the terms F^{det} and F^{sw} at once
+    // uses the gaugefields in hf and changes the derivative field in hf
+    sw_all(hf, mnl->kappa, mnl->c_sw);
 
-    sw_spinor(mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
   }
-  
-  // now we compute
-  // finally, using the insertion matrices stored in swm and swp
-  // we compute the terms F^{det} and F^{sw} at once
-  // uses the gaugefields in hf and changes the derivative field in hf
-  sw_all(hf, mnl->kappa, mnl->c_sw);
 
   mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   tm_stopwatch_pop(&g_timers, 0, 1, "");
@@ -250,7 +303,7 @@ double cloverdet_acc(const int id, hamiltonian_field_t * const hf) {
   /* Compute the energy contr. from first field */
   tm_stopwatch_push(&g_timers, "energy1_square_norm", "");
   mnl->energy1 = square_norm(mnl->w_fields[0], N, 1);
-  tm_stopwatch_pop(&g_timers, 0, 1, ""); 
+  tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   if(g_proc_id == 0) {
diff --git a/monomial/cloverdetratio_monomial.c b/monomial/cloverdetratio_monomial.c
index 4dea03408..ccaf91d5a 100644
--- a/monomial/cloverdetratio_monomial.c
+++ b/monomial/cloverdetratio_monomial.c
@@ -26,6 +26,7 @@
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
+#include <string.h>
 #include "global.h"
 #include "su3.h"
 #include "start.h"
@@ -45,6 +46,12 @@
 #include "monomial/monomial.h"
 #include "boundary.h"
 #include "cloverdetratio_monomial.h"
+#include "xchange/xchange_deri.h"
+#include "monomial/gauge_monomial.h"
+#include "compare_derivative.h"
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
 
 /* think about chronological solver ! */
 
@@ -214,39 +221,81 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
 		       mnl->forceprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
   chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
 		      mnl->csg_N, &mnl->csg_n, VOLUME/2);
-  // Apply Q_{-} to get Y_W -> w_fields[0]
-  tm_stopwatch_push(&g_timers, "Qm_diff", "");
-  mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
-  // Compute phi - Y_W -> w_fields[0]
-  diff(mnl->w_fields[0], mnl->w_fields[0], mnl->pf, VOLUME/2);
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
 
-  /* apply Hopping Matrix M_{eo} */
-  /* to get the even sites of X */
-  tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-  H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EE, -1, mnl->mu);
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
-  /* \delta Q sandwitched by Y_o^\dagger and X_e */
-  deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
-  
-  /* to get the even sites of Y */
-  tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-  H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EE, +1, mnl->mu);
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
-  /* \delta Q sandwitched by Y_e^\dagger and X_o */
-  deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); 
+  if ( mnl->external_library == QUDA_LIB){
+    if(!mnl->even_odd_flag) {
+      fatal_error("QUDA support only even_odd_flag",__func__);
+    }
+#ifdef TM_USE_QUDA
+    if (g_debug_level > 3) {
+#ifdef TM_USE_MPI
+      // FIXME: here we do not need to set to zero the interior but only the halo
+#ifdef TM_USE_OMP
+      # pragma omp parallel for
+#endif // end setting to zero the halo when using MPI
+      for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
+        for(int mu=0;mu<4;mu++) { 
+          _zero_su3adj(debug_derivative[i][mu]);
+        }
+      }
+#endif
+      // we copy only the interior
+      memcpy(debug_derivative[0], hf->derivative[0], 4*VOLUME*sizeof(su3adj));
+    }
 
-  // here comes the clover term...
-  // computes the insertion matrices for S_eff
-  // result is written to swp and swm
-  // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e  
-  sw_spinor_eo(EO, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
-  
-  // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-  sw_spinor_eo(OE, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+    compute_cloverdet_derivative_quda(mnl, hf, mnl->w_fields[1], mnl->pf, 1 );
 
-  sw_all(hf, mnl->kappa, mnl->c_sw);
+    if (g_debug_level > 3){
+      su3adj **given = hf->derivative;
+      hf->derivative = debug_derivative;
+      mnl->external_library = NO_EXT_LIB;
+      tm_debug_printf( 0, 3, "Recomputing the derivative from tmLQCD\n");
+      cloverdetratio_derivative(no, hf);
+      #ifdef TM_USE_MPI
+        xchange_deri(hf->derivative);// this function use ddummy inside
+      #endif
+      compare_derivative(mnl, given, hf->derivative, 1e-9, "cloverdetratio_derivative");
+      mnl->external_library = QUDA_LIB;
+      hf->derivative = given;
+    }
+#else
+      fatal_error("in %s  external_library == QUDA_LIB requires TM_USE_QUDA to be true",__func__);
+#endif // end ifdef TM_USE_QUDA
+  }
+  else{
+    // Apply Q_{-} to get Y_W -> w_fields[0]
+    tm_stopwatch_push(&g_timers, "Qm_diff", "");
+    mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
+    // Compute phi - Y_W -> w_fields[0]
+    diff(mnl->w_fields[0], mnl->w_fields[0], mnl->pf, VOLUME/2);
+    tm_stopwatch_pop(&g_timers, 0, 1, "");
 
+    /* apply Hopping Matrix M_{eo} */
+    /* to get the even sites of X */
+    tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
+    H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu);
+    tm_stopwatch_pop(&g_timers, 0, 1, "");
+    /* \delta Q sandwitched by Y_o^\dagger and X_e */
+    deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
+    
+    /* to get the even sites of Y */
+    tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
+    H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu);
+    tm_stopwatch_pop(&g_timers, 0, 1, "");
+    /* \delta Q sandwitched by Y_e^\dagger and X_o */
+    deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); 
+
+    // here comes the clover term...
+    // computes the insertion matrices for S_eff
+    // result is written to swp and swm
+    // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e  
+    sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+    
+    // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
+    sw_spinor_eo(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+
+    sw_all(hf, mnl->kappa, mnl->c_sw);
+  }
   mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   tm_stopwatch_pop(&g_timers, 0, 1, "");
   return;
diff --git a/monomial/gauge_monomial.c b/monomial/gauge_monomial.c
index 7a22047d1..59f6455dd 100644
--- a/monomial/gauge_monomial.c
+++ b/monomial/gauge_monomial.c
@@ -44,49 +44,12 @@
 #include "monomial/monomial.h"
 #include "hamiltonian_field.h"
 #include "gauge_monomial.h"
+#include "compare_derivative.h"
 #include "fatal_error.h"
 #ifdef TM_USE_QUDA
 #include "quda_interface.h"
 #endif
 
-/* this function compares the gauge derivative calculated by an external library and tmLQCD */
-void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native ){
-  const double threshold = 1e-7;
-  int n_diff = 0;
-
-  for(int ix = 0; ix < VOLUME; ix++){
-    for(int mu=0; mu<4; mu++){
-      double *ext=&(ext_lib[ix][mu].d1);
-      double *nat=&(native[ix][mu].d1);
-      for(int j=0; j<8; ++j){
-        double diff=(ext[j]-nat[j])/nat[j];
-        if (sqrt(diff*diff) > threshold){
-          n_diff++;
-          printf("gauge derivative relative deviation %e at (t,x,y,z,mu,j) %d,%d,%d,%d,%d,%d on proc_id %d, ext: %e, native: %e\n", 
-                 diff,
-                 g_coord[ix][0], g_coord[ix][1], g_coord[ix][2], g_coord[ix][3], mu, j,
-                 g_proc_id,
-                 ext[j], nat[j]);
-        }
-      }
-    }
-  }
-  if(n_diff > 0){
-    printf("gauge_derivative: the relative deviation between tmLQCD and the external library "
-           "exceeds the threshold %.1e in %d case(s) for parameters: c0=%e c1=%e g_beta=%e on proc_id: %d\n",
-           threshold,
-           n_diff,
-           mnl->c0,
-           mnl->c1,
-           mnl->beta,
-           g_proc_id);
-
-    if(g_strict_residual_check) fatal_error("Difference between external library and tmLQCD-native function!", 
-                                            "gauge_derivative");
-  }
-}
-
-
 /* this function calculates the derivative of the momenta: equation 13 of Gottlieb */
 void gauge_derivative(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
@@ -107,7 +70,7 @@ void gauge_derivative(const int id, hamiltonian_field_t * const hf) {
      hf->derivative=ddummy;
      mnl->external_library=NO_EXT_LIB;
      gauge_derivative(id, hf);
-     compare_derivative(mnl,given, ddummy);
+     compare_derivative(mnl, given, ddummy, 1e-9, "gauge_derivative");
      mnl->external_library=QUDA_LIB;
      hf->derivative=given;
     }
diff --git a/monomial/monomial.c b/monomial/monomial.c
index b7e8a55c5..2227e3897 100644
--- a/monomial/monomial.c
+++ b/monomial/monomial.c
@@ -248,6 +248,12 @@ int init_monomials(const int V, const int even_odd_flag) {
           monomial_list[i].Qm = &Qsw_full_minus_psi;
         }
         init_swpm(VOLUME);
+        if(monomial_list[i].external_library==QUDA_LIB){
+          if(monomial_list[i].solver_params.external_inverter != QUDA_INVERTER){
+            tm_debug_printf(0,0,"Error: CLOVERDET monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
+            exit(1);
+          }
+        }
         clover_monomials[no_clover_monomials] = i;
         no_clover_monomials++;
         if(g_proc_id == 0 && g_debug_level > 1) {
@@ -271,6 +277,12 @@ int init_monomials(const int V, const int even_odd_flag) {
         monomial_list[i].Qp = &Qsw_plus_psi;
         monomial_list[i].Qm = &Qsw_minus_psi;
         init_swpm(VOLUME);
+        if(monomial_list[i].external_library==QUDA_LIB){
+          if(monomial_list[i].solver_params.external_inverter != QUDA_INVERTER){
+            tm_debug_printf(0,0,"Error: CLOVERDETRATIO monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
+            exit(1);
+          }
+        }
         if(g_proc_id == 0 && g_debug_level > 1) {
           printf("# Initialised monomial %s of type CLOVERDETRATIO, no_monomials= %d\n",
               monomial_list[i].name,
diff --git a/quda_interface.c b/quda_interface.c
index bdf143886..9789c1b7a 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -159,7 +159,9 @@ QudaEigParam eig_param;
 // pointer to the QUDA gaugefield
 double *gauge_quda[4];
 
-QudaGaugeParam  f_gauge_param;
+
+// re-initialized each time by _initMomQuda()
+QudaGaugeParam  f_gauge_param; 
 // pointer to the QUDA momentum field
 double *mom_quda[4];
 double *mom_quda_reordered[4];
@@ -2104,11 +2106,12 @@ int invert_eo_degenerate_quda(spinor * const out,
 
   int iterations = 0;
 
+  void *spinorIn  = (void*)in; // source
+  void *spinorOut = (void*)out; // solution
+
   // it returns if quda is already init
   _initQuda();
 
-  void *spinorIn  = (void*)in; // source
-  void *spinorOut = (void*)out; // solution
 
   if ( rel_prec )
     inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
@@ -2203,6 +2206,10 @@ int invert_eo_degenerate_quda(spinor * const out,
     invertQuda(spinorOut, spinorOut, &inv_param);
     tm_stopwatch_pop(&g_timers, 0, 0, "TM_QUDA");
     reorder_spinor_eo_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 0, 1);
+    inv_param.mu = -inv_param.mu;
+#ifdef TM_QUDA_EXPERIMENTAL
+    inv_param.tm_rho = -inv_param.tm_rho;
+#endif
   } else {
     reorder_spinor_eo_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 0, 1);
   }
@@ -2534,6 +2541,56 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
 
+#ifdef TM_QUDA_FERMIONIC_FORCES
+void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor * const X_o, spinor * const phi, int detratio) {
+  tm_stopwatch_push(&g_timers, __func__, "");
+  
+  _initQuda();
+  _initMomQuda();
+  void *spinorIn;
+  void *spinorPhi;
+  
+  spinorIn  = (void*)X_o;
+  reorder_spinor_eo_toQuda((double*)spinorIn, inv_param.cpu_prec, 0, 1);
+  if (detratio){
+    spinorPhi  = (void*)phi;
+    reorder_spinor_eo_toQuda((double*)spinorPhi, inv_param.cpu_prec, 0, 1);
+  }
+
+  _loadGaugeQuda(mnl->solver_params.compression_type);
+  _loadCloverQuda(&inv_param);
+  tm_stopwatch_push(&g_timers, "computeTMCloverForceQuda", ""); 
+  
+  const int nvector = 1; // number of rational approximants
+  double coeff[1] = {4.*mnl->kappa*mnl->kappa}; // minus because in p(QUDA)=-Y (tmLQCD) , the factor 4 is experimentally observed
+  
+  inv_param.kappa= mnl->kappa;
+  inv_param.clover_csw= mnl->c_sw;
+  inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
+  // when using QUDA MG the following parameter need to be set 
+  inv_param.matpc_type = QUDA_MATPC_ODD_ODD_ASYMMETRIC;
+  inv_param.dagger = QUDA_DAG_YES;
+  computeTMCloverForceQuda(mom_quda, &spinorIn, &spinorPhi, coeff,  nvector, &f_gauge_param,   &inv_param, detratio);
+
+  tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
+  
+  reorder_mom_fromQuda(mom_quda);
+  add_mom_to_derivative(hf->derivative);
+
+  // we always need to restore the source
+  if (detratio){
+    reorder_spinor_eo_fromQuda((double*)spinorPhi, inv_param.cpu_prec, 0, 1);
+  }
+  tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
+}
+#else 
+void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor * const X_o, spinor * const phi, int detratio) {
+  tm_debug_printf(0,0,"Error:   UseExternalLibrary = quda requires that tmLQCD is compiled with --enable-quda_fermionic=yes\n");
+  exit(1);
+}
+#endif
+
+
 void  compute_WFlow_quda(const double eps, const double tmax, const int traj, FILE* outfile){
   tm_stopwatch_push(&g_timers, __func__, "");
   
diff --git a/quda_interface.h b/quda_interface.h
index 05fc5444a..b79457089 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -146,6 +146,7 @@ void M_quda(spinor * const P, spinor * const Q);
 
 
 // to be called instead of tmLQCD functions to use the QUDA inverter in solve_degenerate
+// NOTE: the global struct inv_param is initialized inside this function
 int invert_eo_degenerate_quda(spinor * const Odd_new,
                               spinor * const Odd,
                               const double precision, const int max_iter,
@@ -172,6 +173,8 @@ int invert_eo_quda_twoflavour_mshift(spinor ** const out_up, spinor ** const out
                                      CompressionType compression);
 
 void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf);
+void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf,
+                                     spinor * const X_o, spinor * const phi, int ratio);
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
 
diff --git a/read_input.l b/read_input.l
index 4bebf2790..3b39524dc 100644
--- a/read_input.l
+++ b/read_input.l
@@ -2334,6 +2334,14 @@ static inline double fltlist_next_token(int * const list_end){
     if(myverbose) printf("  Use QPhiX inverter line %d monomial %d\n", line_of_file, current_monomial);
     mnl->solver_params.external_inverter = NO_EXT_INV;
   }
+  {SPC}*UseExternalLibrary{EQL}no {
+    mnl->external_library = NO_EXT_LIB;
+    if(myverbose) printf("  UseExternalLibrary set to false in line %d, monomial %d\n", line_of_file, current_monomial);
+  }
+  {SPC}*UseExternalLibrary{EQL}quda {
+    mnl->external_library = QUDA_LIB;
+    if(myverbose) printf("  UseExternalLibrary set to quda in line %d, monomial %d\n", line_of_file, current_monomial);
+  }
   {SPC}*UseSloppyPrecision{EQL}yes {
     if(myverbose) printf("  Use sloppy precision (single) in the inverter (if supported) line %d monomial %d\n", line_of_file, current_monomial);
     mnl->solver_params.sloppy_precision = SLOPPY_SINGLE;