From 018547596701471c1cec0f36905fa2d3ef249f85 Mon Sep 17 00:00:00 2001
From: simone-romiti <simone.romiti.1994@gmail.com>
Date: Thu, 6 Oct 2022 16:58:19 +0200
Subject: [PATCH 01/34] first steps: building separate function for quda
 initialization

---
 quda_interface.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 36c7974ed..f605afbc5 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2092,11 +2092,12 @@ int invert_eo_degenerate_quda(spinor * const out,
 
   int iterations = 0;
 
+  void *spinorIn  = (void*)in; // source
+  void *spinorOut = (void*)out; // solution
+
   // it returns if quda is already init
   _initQuda();
 
-  void *spinorIn  = (void*)in; // source
-  void *spinorOut = (void*)out; // solution
 
   if ( rel_prec )
     inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;

From c318eb46b3988ffbf9b97a819b6049f4efe9888d Mon Sep 17 00:00:00 2001
From: simone-romiti <simone.romiti.1994@gmail.com>
Date: Thu, 6 Oct 2022 18:54:41 +0200
Subject: [PATCH 02/34] 1st draft quda_cloverdet_derivative()

---
 monomial/cloverdet_monomial.c | 105 +++++++++++++++++++++-------------
 quda_interface.c              |  79 ++++++++++++++++++++++++-
 quda_interface.h              |   2 +
 3 files changed, 145 insertions(+), 41 deletions(-)

diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index e9863b29e..03c56e6a4 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -107,50 +107,75 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   tm_stopwatch_push(&g_timers, "Qm", "");
   mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
   tm_stopwatch_pop(&g_timers, 0, 1, "");
-  if(mnl->even_odd_flag) {
-    // apply Hopping Matrix M_{eo}
-    // to get the even sites of X_e
-    tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-    H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu);
-    tm_stopwatch_pop(&g_timers, 0, 1, "");
-    // \delta Q sandwitched by Y_o^\dagger and X_e
-    deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
-    
-    // to get the even sites of Y_e
-    tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-    H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu);
-    tm_stopwatch_pop(&g_timers, 0, 1, "");
-    // \delta Q sandwitched by Y_e^\dagger and X_o
-    // uses the gauge field in hf and changes the derivative fields in hf
-    deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor);
+
+  if ( mnl->solver_params.external_inverter == QUDA_INVERTER){
+  #ifdef TM_USE_QUDA
+
+    if (g_debug_level > 3) {
+     memcpy(ddummy[0], hf->derivative[0], (4*VOLUMEPLUSRAND+1)*sizeof(su3adj));
+    }
+
+    compute_cloverdet_derivative_quda(mnl, hf, mnl->w_fields[1]);
+
+    if (g_debug_level > 3){
+      su3adj **given=hf->derivative;
+      hf->derivative=ddummy;
+      mnl->solver_params.external_inverter = NO_EXT_INV;
+      cloverdet_derivative(id, hf);
+      compare_derivative(mnl,given, ddummy);
+      mnl->solver_params.external_inverter = QUDA_INVERTER;
+      hf->derivative=given;
+    }
+
+  #endif // no other option, TM_USE_QUDA already checked by solver
+  }
+  else{
+    if(mnl->even_odd_flag) {
+      // apply Hopping Matrix M_{eo}
+      // to get the even sites of X_e
+      tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
+      H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu);
+      tm_stopwatch_pop(&g_timers, 0, 1, "");
+      // \delta Q sandwitched by Y_o^\dagger and X_e
+      deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
+      
+      // to get the even sites of Y_e
+      tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
+      H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu);
+      tm_stopwatch_pop(&g_timers, 0, 1, "");
+      // \delta Q sandwitched by Y_e^\dagger and X_o
+      // uses the gauge field in hf and changes the derivative fields in hf
+      deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor);
+      
+      // here comes the clover term...
+      // computes the insertion matrices for S_eff
+      // result is written to swp and swm
+      // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e
+      sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+      
+      // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
+      sw_spinor_eo(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
     
-    // here comes the clover term...
-    // computes the insertion matrices for S_eff
-    // result is written to swp and swm
-    // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e
-    sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+      // compute the contribution for the det-part
+      // we again compute only the insertion matrices for S_det
+      // the result is added to swp and swm
+      // even sites only!
+      sw_deriv(EE, mnl->mu);
+    }
+    else {
+      /* \delta Q sandwitched by Y^\dagger and X */
+      deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor);
+
+      sw_spinor(mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+    }
     
-    // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-    sw_spinor_eo(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
-  
-    // compute the contribution for the det-part
-    // we again compute only the insertion matrices for S_det
-    // the result is added to swp and swm
-    // even sites only!
-    sw_deriv(EE, mnl->mu);
-  }
-  else {
-    /* \delta Q sandwitched by Y^\dagger and X */
-    deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor);
+    // now we compute
+    // finally, using the insertion matrices stored in swm and swp
+    // we compute the terms F^{det} and F^{sw} at once
+    // uses the gaugefields in hf and changes the derivative field in hf
+    sw_all(hf, mnl->kappa, mnl->c_sw);
 
-    sw_spinor(mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
   }
-  
-  // now we compute
-  // finally, using the insertion matrices stored in swm and swp
-  // we compute the terms F^{det} and F^{sw} at once
-  // uses the gaugefields in hf and changes the derivative field in hf
-  sw_all(hf, mnl->kappa, mnl->c_sw);
 
   mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   tm_stopwatch_pop(&g_timers, 0, 1, "");
diff --git a/quda_interface.c b/quda_interface.c
index f605afbc5..385ed767c 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -151,7 +151,9 @@ tm_QudaParams_t quda_input;
 // pointer to the QUDA gaugefield
 double *gauge_quda[4];
 
-QudaGaugeParam  f_gauge_param;
+
+// re-initialized each time by _initMomQuda()
+QudaGaugeParam  f_gauge_param; 
 // pointer to the QUDA momentum field
 double *mom_quda[4];
 double *mom_quda_reordered[4];
@@ -2521,6 +2523,81 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
 
+void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor ** const X_o) {
+  tm_stopwatch_push(&g_timers, __func__, "");
+  
+  _initQuda();
+  _initMomQuda();
+
+  // const int rect = mnl->use_rectangles;
+
+//  int * path_length = rect ? plaq_rect_length : plaq_length;
+
+  // const int num_paths = rect ? 24 : 6;
+  // const int max_length = rect ? 5 : 3;
+
+  // // can't use stack memory for this so we need to copy to a heap buffer
+  // int *** path_buf = malloc(4*(num_paths+1)*sizeof(void*));
+  // for(int i=0; i<4; i++) {
+  //   path_buf[i] = (int**) path_buf+4+i*num_paths;
+  //   for(int j=0; j<num_paths; j++) {
+  //     if(rect) 
+  //       path_buf[i][j] = plaq_rect_path[i][j];
+  //     else
+  //       path_buf[i][j] = plaq_path[i][j];
+  //   }
+  // }
+
+  // double * loop_coeff = malloc(num_paths*sizeof(double));
+  // for(int i=0; i<num_paths; i++) {
+  //   // Plaq coeffs
+  //   if(i<6)
+  //     loop_coeff[i] = -0.66666666666 * g_beta * mnl->c0; //2/3 conversion factor to match tmLQCD results
+  //   // Rect coeffs
+  //   else
+  //     loop_coeff[i] = -0.66666666666 * g_beta * mnl->c1;
+  // }
+  
+  #pragma omp parallel for
+  for(int i = 0; i < 4; i++){
+    memset(mom_quda[i], 0, VOLUME*10*sizeof(double));
+  }
+
+  reorder_gauge_toQuda(hf->gaugefield, NO_COMPRESSION);
+  // the reordering above overwrites gauge_quda
+  // to make sure that things become synchronised again at the
+  // next _loadGaugeQuda, we reset the QUDA gauge state here
+  reset_quda_gauge_state(&quda_gauge_state);
+  
+  tm_stopwatch_push(&g_timers, "computeGaugeForceQuda", ""); 
+  // computeGaugeForceQuda((void*)mom_quda, 
+  //                       (void*)gauge_quda, 
+  //                       path_buf, path_length, loop_coeff, num_paths, max_length, 1.0,
+  //                       &f_gauge_param);
+  void ** foo1 = NULL; // not used by quda
+  void * foo2 = NULL; // not used by quda
+  const int nvector = 1; // number of rational approximants
+  double coeff[1] = {1.0};  
+  double kappa2_quda = - mnl->kappa*mnl->kappa; // -kappa*kappa
+  double ck_quda = - mnl->c_sw * mnl->kappa / 8.0;
+  const double multiplicity = 1.0; 
+  computeCloverForceQuda(mom_quda, /*dt=*/1.0, X_o, foo1, coeff, kappa2_quda, ck_quda,
+                            nvector, multiplicity, foo2, &f_gauge_param,
+                            &inv_param);
+
+  tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
+
+  // free(path_buf);
+  // free(loop_coeff);
+  
+  reorder_mom_fromQuda(mom_quda);
+  add_mom_to_derivative(hf->derivative);
+
+  tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
+}
+
+
+
 void  compute_WFlow_quda(const double eps, const double tmax, const int traj, FILE* outfile){
   tm_stopwatch_push(&g_timers, __func__, "");
   
diff --git a/quda_interface.h b/quda_interface.h
index d555544d0..ee454c66c 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -146,6 +146,7 @@ void M_quda(spinor * const P, spinor * const Q);
 
 
 // to be called instead of tmLQCD functions to use the QUDA inverter in solve_degenerate
+// NOTE: the global struct inv_param is initialized inside this function
 int invert_eo_degenerate_quda(spinor * const Odd_new,
                               spinor * const Odd,
                               const double precision, const int max_iter,
@@ -172,6 +173,7 @@ int invert_eo_quda_twoflavour_mshift(spinor ** const out_up, spinor ** const out
                                      CompressionType compression);
 
 void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf);
+void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor ** const X_o);
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
 #endif /* QUDA_INTERFACE_H_ */

From 1abcfb12e4894bc8af188b7091d2051aa78ed26c Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Wed, 12 Oct 2022 18:29:36 +0200
Subject: [PATCH 03/34] reorder_spinor_eo_toQuda

---
 quda_interface.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 385ed767c..ab500ca0d 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2528,6 +2528,21 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   
   _initQuda();
   _initMomQuda();
+  void *spinorIn;
+  const int nr_sf_in = 1;
+  spinor ** in_o;
+
+  if (g_debug_level > 3){ // here we need to copy the initial vector to compare with the CPU
+    const int Vh = VOLUME/2;
+    init_solver_field(&in_o, Vh, nr_sf_in);
+    memcpy(in_o[0],      X_o, Vh*24*sizeof(double));
+    spinorIn  = (void*)in_o[0];
+  }
+  else{
+    spinorIn  = (void*)X_o;
+  }
+
+  reorder_spinor_eo_toQuda((double*)spinorIn, inv_param.cpu_prec, 0, 1);
 
   // const int rect = mnl->use_rectangles;
 
@@ -2581,12 +2596,14 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   double kappa2_quda = - mnl->kappa*mnl->kappa; // -kappa*kappa
   double ck_quda = - mnl->c_sw * mnl->kappa / 8.0;
   const double multiplicity = 1.0; 
-  computeCloverForceQuda(mom_quda, /*dt=*/1.0, X_o, foo1, coeff, kappa2_quda, ck_quda,
+  computeCloverForceQuda(mom_quda, /*dt=*/1.0, spinorIn, foo1, coeff, kappa2_quda, ck_quda,
                             nvector, multiplicity, foo2, &f_gauge_param,
                             &inv_param);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
-
+  if (g_debug_level > 3){
+    finalize_solver(in_o, nr_sf_in);
+  }
   // free(path_buf);
   // free(loop_coeff);
   

From 1eea6fda40f88768a337f7493673656e73fb4efa Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 18 Oct 2022 18:30:02 +0200
Subject: [PATCH 04/34] third argument of computeCloverForceQuda

---
 quda_interface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quda_interface.c b/quda_interface.c
index ab500ca0d..ff5761c15 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2596,7 +2596,7 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   double kappa2_quda = - mnl->kappa*mnl->kappa; // -kappa*kappa
   double ck_quda = - mnl->c_sw * mnl->kappa / 8.0;
   const double multiplicity = 1.0; 
-  computeCloverForceQuda(mom_quda, /*dt=*/1.0, spinorIn, foo1, coeff, kappa2_quda, ck_quda,
+  computeCloverForceQuda(mom_quda, /*dt=*/1.0, &spinorIn, foo1, coeff, kappa2_quda, ck_quda,
                             nvector, multiplicity, foo2, &f_gauge_param,
                             &inv_param);
 

From b86e617c4c76e1d6d5ee8ab25b5d4927f161b9ec Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Fri, 25 Nov 2022 16:40:52 +0100
Subject: [PATCH 05/34] using computeTMCloverForceQuda

---
 quda_interface.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index ff5761c15..9ba8c77a1 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2596,9 +2596,11 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   double kappa2_quda = - mnl->kappa*mnl->kappa; // -kappa*kappa
   double ck_quda = - mnl->c_sw * mnl->kappa / 8.0;
   const double multiplicity = 1.0; 
-  computeCloverForceQuda(mom_quda, /*dt=*/1.0, &spinorIn, foo1, coeff, kappa2_quda, ck_quda,
-                            nvector, multiplicity, foo2, &f_gauge_param,
-                            &inv_param);
+  // computeCloverForceQuda(mom_quda, /*dt=*/1.0, &spinorIn, foo1, coeff, kappa2_quda, ck_quda,
+  //                           nvector, multiplicity, foo2, &f_gauge_param,
+  //                           &inv_param);
+  computeTMCloverForceQuda(mom_quda, &spinorIn, coeff,  nvector, 
+    foo2/* void *h_gauge */, &f_gauge_param,   &inv_param);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
   if (g_debug_level > 3){

From 2c1eac16772463857e058060edfccbe04678137e Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Fri, 25 Nov 2022 16:41:41 +0100
Subject: [PATCH 06/34] set CG as solver for cloverder derivative test

---
 monomial/cloverdet_monomial.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index 03c56e6a4..99b7a5dac 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -118,12 +118,15 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
     compute_cloverdet_derivative_quda(mnl, hf, mnl->w_fields[1]);
 
     if (g_debug_level > 3){
-      su3adj **given=hf->derivative;
-      hf->derivative=ddummy;
+      su3adj **given = hf->derivative;
+      hf->derivative = ddummy;
+      int store_solver = mnl->solver;
       mnl->solver_params.external_inverter = NO_EXT_INV;
+      mnl->solver = CG;
       cloverdet_derivative(id, hf);
       compare_derivative(mnl,given, ddummy);
       mnl->solver_params.external_inverter = QUDA_INVERTER;
+      mnl->solver = store_solver;
       hf->derivative=given;
     }
 

From cbaa9d4d2be2241657d3ad690558e5d75d9a8865 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 20 Dec 2022 16:49:03 +0100
Subject: [PATCH 07/34] passing the correct gauge filed

---
 quda_interface.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 9ba8c77a1..b7bb83386 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2532,16 +2532,16 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   const int nr_sf_in = 1;
   spinor ** in_o;
 
-  if (g_debug_level > 3){ // here we need to copy the initial vector to compare with the CPU
-    const int Vh = VOLUME/2;
-    init_solver_field(&in_o, Vh, nr_sf_in);
-    memcpy(in_o[0],      X_o, Vh*24*sizeof(double));
-    spinorIn  = (void*)in_o[0];
-  }
-  else{
-    spinorIn  = (void*)X_o;
-  }
-
+  // if (g_debug_level > 3){ // here we need to copy the initial vector to compare with the CPU
+  //   const int Vh = VOLUME/2;
+  //   init_solver_field(&in_o, Vh, nr_sf_in);
+  //   memcpy(in_o[0],      X_o, Vh*24*sizeof(double));
+  //   spinorIn  = (void*)in_o[0];
+  // }
+  // else{
+    
+  // }
+  spinorIn  = (void*)X_o;
   reorder_spinor_eo_toQuda((double*)spinorIn, inv_param.cpu_prec, 0, 1);
 
   // const int rect = mnl->use_rectangles;
@@ -2578,12 +2578,13 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
     memset(mom_quda[i], 0, VOLUME*10*sizeof(double));
   }
 
-  reorder_gauge_toQuda(hf->gaugefield, NO_COMPRESSION);
-  // the reordering above overwrites gauge_quda
-  // to make sure that things become synchronised again at the
-  // next _loadGaugeQuda, we reset the QUDA gauge state here
-  reset_quda_gauge_state(&quda_gauge_state);
-  
+  // reorder_gauge_toQuda(hf->gaugefield, NO_COMPRESSION);
+  // // the reordering above overwrites gauge_quda
+  // // to make sure that things become synchronised again at the
+  // // next _loadGaugeQuda, we reset the QUDA gauge state here
+  // reset_quda_gauge_state(&quda_gauge_state);
+  // _loadGaugeQuda(NO_COMPRESSION);
+
   tm_stopwatch_push(&g_timers, "computeGaugeForceQuda", ""); 
   // computeGaugeForceQuda((void*)mom_quda, 
   //                       (void*)gauge_quda, 
@@ -2603,9 +2604,9 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
     foo2/* void *h_gauge */, &f_gauge_param,   &inv_param);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
-  if (g_debug_level > 3){
-    finalize_solver(in_o, nr_sf_in);
-  }
+  // if (g_debug_level > 3){
+  //   finalize_solver(in_o, nr_sf_in);
+  // }
   // free(path_buf);
   // free(loop_coeff);
   

From 71cac33026531b7068187391f0e83e13aafd49f8 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 20 Dec 2022 16:49:31 +0100
Subject: [PATCH 08/34] check for nan or inf

---
 monomial/gauge_monomial.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monomial/gauge_monomial.c b/monomial/gauge_monomial.c
index 7a22047d1..1c1270482 100644
--- a/monomial/gauge_monomial.c
+++ b/monomial/gauge_monomial.c
@@ -60,7 +60,7 @@ void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native ){
       double *nat=&(native[ix][mu].d1);
       for(int j=0; j<8; ++j){
         double diff=(ext[j]-nat[j])/nat[j];
-        if (sqrt(diff*diff) > threshold){
+        if (sqrt(diff*diff) > threshold || isnan( ext[j] ) || isinf(ext[j]) ){
           n_diff++;
           printf("gauge derivative relative deviation %e at (t,x,y,z,mu,j) %d,%d,%d,%d,%d,%d on proc_id %d, ext: %e, native: %e\n", 
                  diff,

From a240e8bb8256789178dbae35032b805419d6729d Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Thu, 26 Jan 2023 17:31:57 +0100
Subject: [PATCH 09/34] xchange_deri before comparing to quda

---
 global.h                      |  2 +-
 init/init_moment_field.c      | 23 ++++++++++++++++++++++-
 monomial/cloverdet_monomial.c | 31 ++++++++++++++++++++++---------
 monomial/gauge_monomial.c     | 14 +++++++-------
 quda_interface.c              |  6 +++---
 5 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/global.h b/global.h
index 09009b971..ab1c9c171 100644
--- a/global.h
+++ b/global.h
@@ -194,7 +194,7 @@ EXTERN su3_32 ** g_gauge_field_copy_32;
 
 EXTERN su3adj ** moment;
 EXTERN su3adj ** df0;
-EXTERN su3adj ** ddummy;
+EXTERN su3adj ** ddummy, ** debug_derivative;
 
 EXTERN int count00,count01,count10,count11,count20,count21;
 EXTERN double g_kappa, g_c_sw, g_beta;
diff --git a/init/init_moment_field.c b/init/init_moment_field.c
index f0db1a156..e7a000e02 100644
--- a/init/init_moment_field.c
+++ b/init/init_moment_field.c
@@ -28,7 +28,7 @@
 #include "su3adj.h"
 #include "sse.h"
 
-su3adj * mo=NULL, *df=NULL, *du=NULL;
+su3adj * mo=NULL, *df=NULL, *du=NULL, *du_internal=NULL;
 
 int init_moment_field(const int V, const int VR) {
   int i = 0;
@@ -94,6 +94,27 @@ int init_moment_field(const int V, const int VR) {
     ddummy[i] = ddummy[i-1]+4;
   }
 
+  if(g_debug_level>3){
+    if((void*)(du_internal = (su3adj*)calloc(4*VR+1, sizeof(su3adj))) == NULL) {
+      printf ("malloc errno : %d\n",errno); 
+      errno = 0;
+      return(5);
+    }
+    if((void*)(debug_derivative = (su3adj**)calloc(VR,sizeof(su3adj*))) == NULL) {
+      printf ("malloc errno : %d\n",errno); 
+      errno = 0;
+      return(6);
+    }
+#if ( defined SSE || defined SSE2 || defined SSE3)
+    debug_derivative[0] = (su3adj*)(((unsigned long int)(du_internal)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+    debug_derivative[0] = du_internal;
+#endif
+    
+    for(i = 1; i < VR; i++){
+      debug_derivative[i] = debug_derivative[i-1]+4;
+    }
+  }
   return(0);
 }
 
diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index 99b7a5dac..5f1ef0c50 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -103,36 +103,49 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
                       mnl->csg_N, &mnl->csg_n, N);
   
-  // Y_o -> w_fields[0]
-  tm_stopwatch_push(&g_timers, "Qm", "");
-  mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
+  
 
   if ( mnl->solver_params.external_inverter == QUDA_INVERTER){
   #ifdef TM_USE_QUDA
 
     if (g_debug_level > 3) {
-     memcpy(ddummy[0], hf->derivative[0], (4*VOLUMEPLUSRAND+1)*sizeof(su3adj));
+#ifdef TM_USE_MPI
+      for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
+        for(int mu=0;mu<4;mu++) { 
+          _zero_su3adj(debug_derivative[i][mu]);
+        }
+      }
+#endif
+      memcpy(debug_derivative[0], hf->derivative[0], 4*VOLUME*sizeof(su3adj));
     }
 
     compute_cloverdet_derivative_quda(mnl, hf, mnl->w_fields[1]);
 
     if (g_debug_level > 3){
       su3adj **given = hf->derivative;
-      hf->derivative = ddummy;
+      hf->derivative = debug_derivative;
       int store_solver = mnl->solver;
       mnl->solver_params.external_inverter = NO_EXT_INV;
       mnl->solver = CG;
+      tm_debug_printf( 0, 3, "Recomputing the derivative from tmLQCD\n");
       cloverdet_derivative(id, hf);
-      compare_derivative(mnl,given, ddummy);
+      #ifdef TM_USE_MPI
+        xchange_deri(hf->derivative);// this function use ddummy inside
+      #endif
+      compare_derivative(mnl, given, hf->derivative);
       mnl->solver_params.external_inverter = QUDA_INVERTER;
       mnl->solver = store_solver;
-      hf->derivative=given;
+      hf->derivative = given;
     }
-
   #endif // no other option, TM_USE_QUDA already checked by solver
   }
   else{
+    // Y_o -> w_fields[0]
+    tm_stopwatch_push(&g_timers, "Qm", "");
+    mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
+    tm_stopwatch_pop(&g_timers, 0, 1, "");
+    // print_spinor(mnl->w_fields[0], 0 , 1);
+
     if(mnl->even_odd_flag) {
       // apply Hopping Matrix M_{eo}
       // to get the even sites of X_e
diff --git a/monomial/gauge_monomial.c b/monomial/gauge_monomial.c
index 1c1270482..6b45174ab 100644
--- a/monomial/gauge_monomial.c
+++ b/monomial/gauge_monomial.c
@@ -59,14 +59,14 @@ void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native ){
       double *ext=&(ext_lib[ix][mu].d1);
       double *nat=&(native[ix][mu].d1);
       for(int j=0; j<8; ++j){
-        double diff=(ext[j]-nat[j])/nat[j];
+        double diff;
+        if (fabs(nat[j])>1e-7) diff=(ext[j]-nat[j])/nat[j];
+        else diff=ext[j]-nat[j];
         if (sqrt(diff*diff) > threshold || isnan( ext[j] ) || isinf(ext[j]) ){
-          n_diff++;
-          printf("gauge derivative relative deviation %e at (t,x,y,z,mu,j) %d,%d,%d,%d,%d,%d on proc_id %d, ext: %e, native: %e\n", 
-                 diff,
-                 g_coord[ix][0], g_coord[ix][1], g_coord[ix][2], g_coord[ix][3], mu, j,
-                 g_proc_id,
-                 ext[j], nat[j]);
+            n_diff++;
+            printf("derivative at (t,x,y,z,mu,j) %d,%d,%d,%d,%d,%d, ext: %-14e, native: %-14e ratio: %-14g diff %-14g  on proc_id %d\n", 
+                  g_coord[ix][0], g_coord[ix][1], g_coord[ix][2], g_coord[ix][3], mu, j,
+                  ext[j], nat[j], ext[j]/nat[j], ext[j]-nat[j], g_proc_id);
         }
       }
     }
diff --git a/quda_interface.c b/quda_interface.c
index b7bb83386..feaa982c2 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2583,8 +2583,8 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   // // to make sure that things become synchronised again at the
   // // next _loadGaugeQuda, we reset the QUDA gauge state here
   // reset_quda_gauge_state(&quda_gauge_state);
-  // _loadGaugeQuda(NO_COMPRESSION);
-
+  _loadGaugeQuda(NO_COMPRESSION);
+  _loadCloverQuda(&inv_param);
   tm_stopwatch_push(&g_timers, "computeGaugeForceQuda", ""); 
   // computeGaugeForceQuda((void*)mom_quda, 
   //                       (void*)gauge_quda, 
@@ -2593,7 +2593,7 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   void ** foo1 = NULL; // not used by quda
   void * foo2 = NULL; // not used by quda
   const int nvector = 1; // number of rational approximants
-  double coeff[1] = {1.0};  
+  double coeff[1] = {4.*mnl->kappa*mnl->kappa}; // minus because in p(QUDA)=-Y (tmLQCD) , the factor 4 is experimentally observed
   double kappa2_quda = - mnl->kappa*mnl->kappa; // -kappa*kappa
   double ck_quda = - mnl->c_sw * mnl->kappa / 8.0;
   const double multiplicity = 1.0; 

From a30f44ac19d7d1e412c9f0fd2592a1a9ce84876e Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 31 Jan 2023 19:55:38 +0100
Subject: [PATCH 10/34] QUDA only support even odd fermionic forces

---
 monomial/cloverdet_monomial.c |  4 ++-
 quda_interface.c              | 56 ++++-------------------------------
 2 files changed, 8 insertions(+), 52 deletions(-)

diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index 5f1ef0c50..da25aac3b 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -106,8 +106,10 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   
 
   if ( mnl->solver_params.external_inverter == QUDA_INVERTER){
+    if(!mnl->even_odd_flag) {
+      fatal_error("QUDA support only even_odd_flag",__func__);
+    }
   #ifdef TM_USE_QUDA
-
     if (g_debug_level > 3) {
 #ifdef TM_USE_MPI
       for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
diff --git a/quda_interface.c b/quda_interface.c
index feaa982c2..af8a60464 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2532,64 +2532,21 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   const int nr_sf_in = 1;
   spinor ** in_o;
 
-  // if (g_debug_level > 3){ // here we need to copy the initial vector to compare with the CPU
-  //   const int Vh = VOLUME/2;
-  //   init_solver_field(&in_o, Vh, nr_sf_in);
-  //   memcpy(in_o[0],      X_o, Vh*24*sizeof(double));
-  //   spinorIn  = (void*)in_o[0];
-  // }
-  // else{
-    
-  // }
+  
   spinorIn  = (void*)X_o;
   reorder_spinor_eo_toQuda((double*)spinorIn, inv_param.cpu_prec, 0, 1);
 
-  // const int rect = mnl->use_rectangles;
-
-//  int * path_length = rect ? plaq_rect_length : plaq_length;
-
-  // const int num_paths = rect ? 24 : 6;
-  // const int max_length = rect ? 5 : 3;
-
-  // // can't use stack memory for this so we need to copy to a heap buffer
-  // int *** path_buf = malloc(4*(num_paths+1)*sizeof(void*));
-  // for(int i=0; i<4; i++) {
-  //   path_buf[i] = (int**) path_buf+4+i*num_paths;
-  //   for(int j=0; j<num_paths; j++) {
-  //     if(rect) 
-  //       path_buf[i][j] = plaq_rect_path[i][j];
-  //     else
-  //       path_buf[i][j] = plaq_path[i][j];
-  //   }
-  // }
-
-  // double * loop_coeff = malloc(num_paths*sizeof(double));
-  // for(int i=0; i<num_paths; i++) {
-  //   // Plaq coeffs
-  //   if(i<6)
-  //     loop_coeff[i] = -0.66666666666 * g_beta * mnl->c0; //2/3 conversion factor to match tmLQCD results
-  //   // Rect coeffs
-  //   else
-  //     loop_coeff[i] = -0.66666666666 * g_beta * mnl->c1;
-  // }
   
   #pragma omp parallel for
   for(int i = 0; i < 4; i++){
     memset(mom_quda[i], 0, VOLUME*10*sizeof(double));
   }
 
-  // reorder_gauge_toQuda(hf->gaugefield, NO_COMPRESSION);
-  // // the reordering above overwrites gauge_quda
-  // // to make sure that things become synchronised again at the
-  // // next _loadGaugeQuda, we reset the QUDA gauge state here
-  // reset_quda_gauge_state(&quda_gauge_state);
+
   _loadGaugeQuda(NO_COMPRESSION);
   _loadCloverQuda(&inv_param);
   tm_stopwatch_push(&g_timers, "computeGaugeForceQuda", ""); 
-  // computeGaugeForceQuda((void*)mom_quda, 
-  //                       (void*)gauge_quda, 
-  //                       path_buf, path_length, loop_coeff, num_paths, max_length, 1.0,
-  //                       &f_gauge_param);
+  
   void ** foo1 = NULL; // not used by quda
   void * foo2 = NULL; // not used by quda
   const int nvector = 1; // number of rational approximants
@@ -2600,15 +2557,12 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   // computeCloverForceQuda(mom_quda, /*dt=*/1.0, &spinorIn, foo1, coeff, kappa2_quda, ck_quda,
   //                           nvector, multiplicity, foo2, &f_gauge_param,
   //                           &inv_param);
+  inv_param.kappa= mnl->kappa;
+  inv_param.clover_csw= mnl->c_sw;
   computeTMCloverForceQuda(mom_quda, &spinorIn, coeff,  nvector, 
     foo2/* void *h_gauge */, &f_gauge_param,   &inv_param);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
-  // if (g_debug_level > 3){
-  //   finalize_solver(in_o, nr_sf_in);
-  // }
-  // free(path_buf);
-  // free(loop_coeff);
   
   reorder_mom_fromQuda(mom_quda);
   add_mom_to_derivative(hf->derivative);

From a52021fd8d9d47e02e916bce2e190d09774e2856 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Sun, 12 Mar 2023 14:15:05 +0100
Subject: [PATCH 11/34] inconsistent stopwatch labelling

---
 quda_interface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quda_interface.c b/quda_interface.c
index af8a60464..0edf33ff5 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2545,7 +2545,7 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
 
   _loadGaugeQuda(NO_COMPRESSION);
   _loadCloverQuda(&inv_param);
-  tm_stopwatch_push(&g_timers, "computeGaugeForceQuda", ""); 
+  tm_stopwatch_push(&g_timers, "computeTMCloverForceQuda", ""); 
   
   void ** foo1 = NULL; // not used by quda
   void * foo2 = NULL; // not used by quda

From b3596eca41703eda61cd14a332995d2908fb00fe Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Wed, 10 May 2023 11:40:58 +0200
Subject: [PATCH 12/34] compression type

---
 quda_interface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quda_interface.c b/quda_interface.c
index af8a60464..22e06449c 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2543,7 +2543,7 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   }
 
 
-  _loadGaugeQuda(NO_COMPRESSION);
+  _loadGaugeQuda(mnl->solver_params.compression_type);
   _loadCloverQuda(&inv_param);
   tm_stopwatch_push(&g_timers, "computeGaugeForceQuda", ""); 
   

From 6baaca3c8d7b732856752be8bab96a2271d01767 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Sun, 1 Oct 2023 23:09:40 +0200
Subject: [PATCH 13/34] remove unused param

---
 monomial/cloverdet_monomial.c | 4 +++-
 quda_interface.c              | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index da25aac3b..b64f49319 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -112,12 +112,14 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   #ifdef TM_USE_QUDA
     if (g_debug_level > 3) {
 #ifdef TM_USE_MPI
+      // FIXME: here we do not need to set to zero the interior but only the halo
       for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
         for(int mu=0;mu<4;mu++) { 
           _zero_su3adj(debug_derivative[i][mu]);
         }
       }
 #endif
+      // we copy only the interior
       memcpy(debug_derivative[0], hf->derivative[0], 4*VOLUME*sizeof(su3adj));
     }
 
@@ -293,7 +295,7 @@ double cloverdet_acc(const int id, hamiltonian_field_t * const hf) {
   /* Compute the energy contr. from first field */
   tm_stopwatch_push(&g_timers, "energy1_square_norm", "");
   mnl->energy1 = square_norm(mnl->w_fields[0], N, 1);
-  tm_stopwatch_pop(&g_timers, 0, 1, ""); 
+  tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   if(g_proc_id == 0) {
diff --git a/quda_interface.c b/quda_interface.c
index 8b38a534e..f07308f94 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2559,8 +2559,7 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   //                           &inv_param);
   inv_param.kappa= mnl->kappa;
   inv_param.clover_csw= mnl->c_sw;
-  computeTMCloverForceQuda(mom_quda, &spinorIn, coeff,  nvector, 
-    foo2/* void *h_gauge */, &f_gauge_param,   &inv_param);
+  computeTMCloverForceQuda(mom_quda, &spinorIn, coeff,  nvector, &f_gauge_param,   &inv_param);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
   

From 288a3052a862b09493eda06a6d9c752f0b0147b0 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 21 Nov 2023 11:58:43 +0100
Subject: [PATCH 14/34] detratio derivative with quda

---
 monomial/cloverdet_monomial.c      |  2 +-
 monomial/cloverdetratio_monomial.c | 98 +++++++++++++++++++++---------
 quda_interface.c                   | 14 ++++-
 quda_interface.h                   |  3 +-
 4 files changed, 83 insertions(+), 34 deletions(-)

diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index b64f49319..e25ca7cd5 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -123,7 +123,7 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
       memcpy(debug_derivative[0], hf->derivative[0], 4*VOLUME*sizeof(su3adj));
     }
 
-    compute_cloverdet_derivative_quda(mnl, hf, mnl->w_fields[1]);
+    compute_cloverdet_derivative_quda(mnl, hf, mnl->w_fields[1], NULL, 0);
 
     if (g_debug_level > 3){
       su3adj **given = hf->derivative;
diff --git a/monomial/cloverdetratio_monomial.c b/monomial/cloverdetratio_monomial.c
index 4dea03408..03d9d60bb 100644
--- a/monomial/cloverdetratio_monomial.c
+++ b/monomial/cloverdetratio_monomial.c
@@ -214,39 +214,79 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
 		       mnl->forceprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
   chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
 		      mnl->csg_N, &mnl->csg_n, VOLUME/2);
-  // Apply Q_{-} to get Y_W -> w_fields[0]
-  tm_stopwatch_push(&g_timers, "Qm_diff", "");
-  mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
-  // Compute phi - Y_W -> w_fields[0]
-  diff(mnl->w_fields[0], mnl->w_fields[0], mnl->pf, VOLUME/2);
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
 
-  /* apply Hopping Matrix M_{eo} */
-  /* to get the even sites of X */
-  tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-  H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EE, -1, mnl->mu);
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
-  /* \delta Q sandwitched by Y_o^\dagger and X_e */
-  deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
-  
-  /* to get the even sites of Y */
-  tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-  H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EE, +1, mnl->mu);
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
-  /* \delta Q sandwitched by Y_e^\dagger and X_o */
-  deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); 
+    if ( mnl->solver_params.external_inverter == QUDA_INVERTER){
+    if(!mnl->even_odd_flag) {
+      fatal_error("QUDA support only even_odd_flag",__func__);
+    }
+  #ifdef TM_USE_QUDA
+    if (g_debug_level > 3) {
+#ifdef TM_USE_MPI
+      // FIXME: here we do not need to set to zero the interior but only the halo
+      for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
+        for(int mu=0;mu<4;mu++) { 
+          _zero_su3adj(debug_derivative[i][mu]);
+        }
+      }
+#endif
+      // we copy only the interior
+      memcpy(debug_derivative[0], hf->derivative[0], 4*VOLUME*sizeof(su3adj));
+    }
 
-  // here comes the clover term...
-  // computes the insertion matrices for S_eff
-  // result is written to swp and swm
-  // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e  
-  sw_spinor_eo(EO, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
-  
-  // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-  sw_spinor_eo(OE, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+    compute_cloverdet_derivative_quda(mnl, hf, mnl->w_fields[1], mnl->pf, 1 );
 
-  sw_all(hf, mnl->kappa, mnl->c_sw);
+    if (g_debug_level > 3){
+      su3adj **given = hf->derivative;
+      hf->derivative = debug_derivative;
+      int store_solver = mnl->solver;
+      mnl->solver_params.external_inverter = NO_EXT_INV;
+      mnl->solver = CG;
+      tm_debug_printf( 0, 3, "Recomputing the derivative from tmLQCD\n");
+      cloverdetratio_derivative(no, hf);
+      #ifdef TM_USE_MPI
+        xchange_deri(hf->derivative);// this function use ddummy inside
+      #endif
+      compare_derivative(mnl, given, hf->derivative);
+      mnl->solver_params.external_inverter = QUDA_INVERTER;
+      mnl->solver = store_solver;
+      hf->derivative = given;
+    }
+  #endif // no other option, TM_USE_QUDA already checked by solver
+  }
+  else{
+    // Apply Q_{-} to get Y_W -> w_fields[0]
+    tm_stopwatch_push(&g_timers, "Qm_diff", "");
+    mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
+    // Compute phi - Y_W -> w_fields[0]
+    diff(mnl->w_fields[0], mnl->w_fields[0], mnl->pf, VOLUME/2);
+    tm_stopwatch_pop(&g_timers, 0, 1, "");
 
+    /* apply Hopping Matrix M_{eo} */
+    /* to get the even sites of X */
+    tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
+    H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EE, -1, mnl->mu);
+    tm_stopwatch_pop(&g_timers, 0, 1, "");
+    /* \delta Q sandwitched by Y_o^\dagger and X_e */
+    deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
+    
+    /* to get the even sites of Y */
+    tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
+    H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EE, +1, mnl->mu);
+    tm_stopwatch_pop(&g_timers, 0, 1, "");
+    /* \delta Q sandwitched by Y_e^\dagger and X_o */
+    deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); 
+
+    // here comes the clover term...
+    // computes the insertion matrices for S_eff
+    // result is written to swp and swm
+    // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e  
+    sw_spinor_eo(EO, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+    
+    // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
+    sw_spinor_eo(OE, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+
+    sw_all(hf, mnl->kappa, mnl->c_sw);
+  }
   mnl_backup_restore_globals(TM_RESTORE_GLOBALS);
   tm_stopwatch_pop(&g_timers, 0, 1, "");
   return;
diff --git a/quda_interface.c b/quda_interface.c
index f07308f94..a563d5dc3 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2523,19 +2523,23 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
 
-void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor ** const X_o) {
+void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor ** const X_o, spinor ** const phi, int detratio) {
   tm_stopwatch_push(&g_timers, __func__, "");
   
   _initQuda();
   _initMomQuda();
   void *spinorIn;
+  void *spinorPhi;
   const int nr_sf_in = 1;
   spinor ** in_o;
 
   
   spinorIn  = (void*)X_o;
   reorder_spinor_eo_toQuda((double*)spinorIn, inv_param.cpu_prec, 0, 1);
-
+  if (detratio){
+    spinorPhi  = (void*)phi;
+    reorder_spinor_eo_toQuda((double*)spinorPhi, inv_param.cpu_prec, 0, 1);
+  }
   
   #pragma omp parallel for
   for(int i = 0; i < 4; i++){
@@ -2559,13 +2563,17 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   //                           &inv_param);
   inv_param.kappa= mnl->kappa;
   inv_param.clover_csw= mnl->c_sw;
-  computeTMCloverForceQuda(mom_quda, &spinorIn, coeff,  nvector, &f_gauge_param,   &inv_param);
+  computeTMCloverForceQuda(mom_quda, &spinorIn, &spinorPhi, coeff,  nvector, &f_gauge_param,   &inv_param, detratio);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
   
   reorder_mom_fromQuda(mom_quda);
   add_mom_to_derivative(hf->derivative);
 
+  // if we want to compare the force to tmLQCD native implementation we should restore the source
+  if (detratio && g_debug_level > 3){
+    reorder_spinor_eo_fromQuda((double*)spinorPhi, inv_param.cpu_prec, 0, 1);
+  }
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
 
diff --git a/quda_interface.h b/quda_interface.h
index ee454c66c..cf9d37faf 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -173,7 +173,8 @@ int invert_eo_quda_twoflavour_mshift(spinor ** const out_up, spinor ** const out
                                      CompressionType compression);
 
 void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf);
-void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor ** const X_o);
+void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf,
+                                     spinor ** const X_o, spinor ** const phi, int ratio);
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
 #endif /* QUDA_INTERFACE_H_ */

From 7b21655cbdcb44aa2e8344e4e19bdb7c82fcb216 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Wed, 22 Nov 2023 15:08:41 +0100
Subject: [PATCH 15/34] restore parameters after using QUDA MG

---
 quda_interface.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/quda_interface.c b/quda_interface.c
index a563d5dc3..027bad208 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2194,6 +2194,10 @@ int invert_eo_degenerate_quda(spinor * const out,
     invertQuda(spinorOut, spinorOut, &inv_param);
     tm_stopwatch_pop(&g_timers, 0, 0, "TM_QUDA");
     reorder_spinor_eo_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 0, 1);
+    inv_param.mu = -inv_param.mu;
+#ifdef TM_QUDA_EXPERIMENTAL
+    inv_param.tm_rho = -inv_param.tm_rho;
+#endif
   } else {
     reorder_spinor_eo_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 0, 1);
   }

From c5becee32afad3feace9a8bec93580db0961aac0 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 5 Dec 2023 11:42:40 +0100
Subject: [PATCH 16/34] parity flags more meaningful

---
 monomial/cloverdetratio_monomial.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/monomial/cloverdetratio_monomial.c b/monomial/cloverdetratio_monomial.c
index 03d9d60bb..57d8c2d77 100644
--- a/monomial/cloverdetratio_monomial.c
+++ b/monomial/cloverdetratio_monomial.c
@@ -264,14 +264,14 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
     /* apply Hopping Matrix M_{eo} */
     /* to get the even sites of X */
     tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-    H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EE, -1, mnl->mu);
+    H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu);
     tm_stopwatch_pop(&g_timers, 0, 1, "");
     /* \delta Q sandwitched by Y_o^\dagger and X_e */
     deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); 
     
     /* to get the even sites of Y */
     tm_stopwatch_push(&g_timers, "H_eo_sw_inv_psi", "");
-    H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EE, +1, mnl->mu);
+    H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu);
     tm_stopwatch_pop(&g_timers, 0, 1, "");
     /* \delta Q sandwitched by Y_e^\dagger and X_o */
     deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); 
@@ -280,10 +280,10 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
     // computes the insertion matrices for S_eff
     // result is written to swp and swm
     // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e  
-    sw_spinor_eo(EO, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+    sw_spinor_eo(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
     
     // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
-    sw_spinor_eo(OE, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+    sw_spinor_eo(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
 
     sw_all(hf, mnl->kappa, mnl->c_sw);
   }

From 020dfa3632ec214a11c72905ff15da0b23ceadde Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Thu, 7 Dec 2023 18:45:12 +0100
Subject: [PATCH 17/34] fix omp in reorder_mom_fromQuda

---
 quda_interface.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 027bad208..27b006e57 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -671,11 +671,6 @@ void reorder_mom_fromQuda() {
   // mom_quda -> mom_quda_reordered
   tm_stopwatch_push(&g_timers, __func__, "");
 
-#ifdef TM_USE_OMP
-#pragma omp parallel
-  {
-#endif
-
 #ifdef TM_USE_OMP
   #pragma omp parallel for collapse(4)
 #endif
@@ -707,9 +702,6 @@ void reorder_mom_fromQuda() {
 #endif
         }
   
-#ifdef TM_USE_OMP
-  }
-#endif
   tm_stopwatch_pop(&g_timers, 0, 0, "TM_QUDA");
 }
 

From 0df1801f0d2b4e7f6e69ce6113d9413c9e54dd19 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Fri, 8 Dec 2023 13:43:28 +0100
Subject: [PATCH 18/34] OMP volume loop

---
 monomial/cloverdet_monomial.c      | 7 +++++--
 monomial/cloverdetratio_monomial.c | 5 ++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index e25ca7cd5..84186905a 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -109,16 +109,19 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
     if(!mnl->even_odd_flag) {
       fatal_error("QUDA support only even_odd_flag",__func__);
     }
-  #ifdef TM_USE_QUDA
+#ifdef TM_USE_QUDA
     if (g_debug_level > 3) {
 #ifdef TM_USE_MPI
       // FIXME: here we do not need to set to zero the interior but only the halo
+#ifdef TM_USE_OMP
+      # pragma omp parallel for
+#endif
       for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
         for(int mu=0;mu<4;mu++) { 
           _zero_su3adj(debug_derivative[i][mu]);
         }
       }
-#endif
+#endif // end setting to zero the halo when using MPI
       // we copy only the interior
       memcpy(debug_derivative[0], hf->derivative[0], 4*VOLUME*sizeof(su3adj));
     }
diff --git a/monomial/cloverdetratio_monomial.c b/monomial/cloverdetratio_monomial.c
index 57d8c2d77..ddd45cb97 100644
--- a/monomial/cloverdetratio_monomial.c
+++ b/monomial/cloverdetratio_monomial.c
@@ -219,10 +219,13 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
     if(!mnl->even_odd_flag) {
       fatal_error("QUDA support only even_odd_flag",__func__);
     }
-  #ifdef TM_USE_QUDA
+#ifdef TM_USE_QUDA
     if (g_debug_level > 3) {
 #ifdef TM_USE_MPI
       // FIXME: here we do not need to set to zero the interior but only the halo
+#ifdef TM_USE_OMP
+      # pragma omp parallel for
+#endif // end setting to zero the halo when using MPI
       for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { 
         for(int mu=0;mu<4;mu++) { 
           _zero_su3adj(debug_derivative[i][mu]);

From 8f4964f868a645beb2bc3e057e031ff62a337818 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Fri, 8 Dec 2023 16:43:33 +0100
Subject: [PATCH 19/34] restore source always

---
 quda_interface.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 27b006e57..8feb4546f 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2566,8 +2566,8 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   reorder_mom_fromQuda(mom_quda);
   add_mom_to_derivative(hf->derivative);
 
-  // if we want to compare the force to tmLQCD native implementation we should restore the source
-  if (detratio && g_debug_level > 3){
+  // we always need to restore the source
+  if (detratio){
     reorder_spinor_eo_fromQuda((double*)spinorPhi, inv_param.cpu_prec, 0, 1);
   }
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");

From 9f7a6d5906fe66faee0d5b5b361f538f45c76386 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Thu, 14 Dec 2023 13:25:07 +0100
Subject: [PATCH 20/34] enforcing QUDA_MATPCDAG_MATPC_SOLUTION

---
 quda_interface.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/quda_interface.c b/quda_interface.c
index 8feb4546f..5790cfcff 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2559,6 +2559,7 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   //                           &inv_param);
   inv_param.kappa= mnl->kappa;
   inv_param.clover_csw= mnl->c_sw;
+  inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
   computeTMCloverForceQuda(mom_quda, &spinorIn, &spinorPhi, coeff,  nvector, &f_gauge_param,   &inv_param, detratio);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");

From b6e5a6d8632ae00eb6009b9a3970d8925c69ca4e Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Thu, 14 Dec 2023 13:43:01 +0100
Subject: [PATCH 21/34] remove unnecessary reset of mom_quda

---
 quda_interface.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 5790cfcff..cda9f1870 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2536,12 +2536,6 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
     spinorPhi  = (void*)phi;
     reorder_spinor_eo_toQuda((double*)spinorPhi, inv_param.cpu_prec, 0, 1);
   }
-  
-  #pragma omp parallel for
-  for(int i = 0; i < 4; i++){
-    memset(mom_quda[i], 0, VOLUME*10*sizeof(double));
-  }
-
 
   _loadGaugeQuda(mnl->solver_params.compression_type);
   _loadCloverQuda(&inv_param);

From 5c532e9823a144261cc9d18a0df1bd173ef23ccb Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Thu, 14 Dec 2023 13:44:51 +0100
Subject: [PATCH 22/34] remove unused variables

---
 quda_interface.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index cda9f1870..50ac4fa25 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2541,16 +2541,9 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   _loadCloverQuda(&inv_param);
   tm_stopwatch_push(&g_timers, "computeTMCloverForceQuda", ""); 
   
-  void ** foo1 = NULL; // not used by quda
-  void * foo2 = NULL; // not used by quda
   const int nvector = 1; // number of rational approximants
   double coeff[1] = {4.*mnl->kappa*mnl->kappa}; // minus because in p(QUDA)=-Y (tmLQCD) , the factor 4 is experimentally observed
-  double kappa2_quda = - mnl->kappa*mnl->kappa; // -kappa*kappa
-  double ck_quda = - mnl->c_sw * mnl->kappa / 8.0;
-  const double multiplicity = 1.0; 
-  // computeCloverForceQuda(mom_quda, /*dt=*/1.0, &spinorIn, foo1, coeff, kappa2_quda, ck_quda,
-  //                           nvector, multiplicity, foo2, &f_gauge_param,
-  //                           &inv_param);
+  
   inv_param.kappa= mnl->kappa;
   inv_param.clover_csw= mnl->c_sw;
   inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;

From fc31c7dbb5f4ec96477146cf58d7a089ae87f8de Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Thu, 14 Dec 2023 21:10:38 +0100
Subject: [PATCH 23/34] remove unnecessary reset of mom_quda also in the gauge
 derivative

---
 quda_interface.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 50ac4fa25..d9e26ee11 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2491,11 +2491,6 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
     else
       loop_coeff[i] = -0.66666666666 * g_beta * mnl->c1;
   }
-  
-  #pragma omp parallel for
-  for(int i = 0; i < 4; i++){
-    memset(mom_quda[i], 0, VOLUME*10*sizeof(double));
-  }
 
   reorder_gauge_toQuda(hf->gaugefield, NO_COMPRESSION);
   // the reordering above overwrites gauge_quda

From 07fc55e53c76e51170775382afb3fc44f2711b30 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 19 Dec 2023 15:33:40 +0100
Subject: [PATCH 24/34] fix according to QUDA 0b99a1d

---
 quda_interface.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/quda_interface.c b/quda_interface.c
index d9e26ee11..6b4cd4a7f 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2542,6 +2542,9 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   inv_param.kappa= mnl->kappa;
   inv_param.clover_csw= mnl->c_sw;
   inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
+  // when using QUDA MG the following parameter need to be set 
+  inv_param.matpc_type = QUDA_MATPC_ODD_ODD_ASYMMETRIC;
+  inv_param.dagger = QUDA_DAG_YES;
   computeTMCloverForceQuda(mom_quda, &spinorIn, &spinorPhi, coeff,  nvector, &f_gauge_param,   &inv_param, detratio);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");

From aeffd7a380ffadac4754f716ee786220739f52be Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 16 Jan 2024 15:01:16 +0100
Subject: [PATCH 25/34] add "UseExternalLibrary"  parameter in input to compute
 the derivative with quda

---
 monomial/cloverdet_monomial.c      | 13 ++++++-------
 monomial/cloverdetratio_monomial.c | 13 ++++++-------
 read_input.l                       |  8 ++++++++
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index 84186905a..be11ef230 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -105,7 +105,7 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
   
   
 
-  if ( mnl->solver_params.external_inverter == QUDA_INVERTER){
+  if ( mnl->external_library == QUDA_LIB){
     if(!mnl->even_odd_flag) {
       fatal_error("QUDA support only even_odd_flag",__func__);
     }
@@ -131,20 +131,19 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
     if (g_debug_level > 3){
       su3adj **given = hf->derivative;
       hf->derivative = debug_derivative;
-      int store_solver = mnl->solver;
-      mnl->solver_params.external_inverter = NO_EXT_INV;
-      mnl->solver = CG;
+      mnl->external_library = NO_EXT_LIB;
       tm_debug_printf( 0, 3, "Recomputing the derivative from tmLQCD\n");
       cloverdet_derivative(id, hf);
       #ifdef TM_USE_MPI
         xchange_deri(hf->derivative);// this function use ddummy inside
       #endif
       compare_derivative(mnl, given, hf->derivative);
-      mnl->solver_params.external_inverter = QUDA_INVERTER;
-      mnl->solver = store_solver;
+      mnl->external_library = QUDA_LIB;
       hf->derivative = given;
     }
-  #endif // no other option, TM_USE_QUDA already checked by solver
+#else
+      fatal_error("in %s  external_library == QUDA_LIB requires TM_USE_QUDA to be true",__func__);
+#endif // end ifdef TM_USE_QUDA
   }
   else{
     // Y_o -> w_fields[0]
diff --git a/monomial/cloverdetratio_monomial.c b/monomial/cloverdetratio_monomial.c
index ddd45cb97..5b21c2829 100644
--- a/monomial/cloverdetratio_monomial.c
+++ b/monomial/cloverdetratio_monomial.c
@@ -215,7 +215,7 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
   chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
 		      mnl->csg_N, &mnl->csg_n, VOLUME/2);
 
-    if ( mnl->solver_params.external_inverter == QUDA_INVERTER){
+  if ( mnl->external_library == QUDA_LIB){
     if(!mnl->even_odd_flag) {
       fatal_error("QUDA support only even_odd_flag",__func__);
     }
@@ -241,20 +241,19 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
     if (g_debug_level > 3){
       su3adj **given = hf->derivative;
       hf->derivative = debug_derivative;
-      int store_solver = mnl->solver;
-      mnl->solver_params.external_inverter = NO_EXT_INV;
-      mnl->solver = CG;
+      mnl->external_library = NO_EXT_LIB;
       tm_debug_printf( 0, 3, "Recomputing the derivative from tmLQCD\n");
       cloverdetratio_derivative(no, hf);
       #ifdef TM_USE_MPI
         xchange_deri(hf->derivative);// this function use ddummy inside
       #endif
       compare_derivative(mnl, given, hf->derivative);
-      mnl->solver_params.external_inverter = QUDA_INVERTER;
-      mnl->solver = store_solver;
+      mnl->external_library = QUDA_LIB;
       hf->derivative = given;
     }
-  #endif // no other option, TM_USE_QUDA already checked by solver
+#else
+      fatal_error("in %s  external_library == QUDA_LIB requires TM_USE_QUDA to be true",__func__);
+#endif // end ifdef TM_USE_QUDA
   }
   else{
     // Apply Q_{-} to get Y_W -> w_fields[0]
diff --git a/read_input.l b/read_input.l
index c4e8c0764..20ef8b490 100644
--- a/read_input.l
+++ b/read_input.l
@@ -2302,6 +2302,14 @@ static inline double fltlist_next_token(int * const list_end){
     if(myverbose) printf("  Use QPhiX inverter line %d monomial %d\n", line_of_file, current_monomial);
     mnl->solver_params.external_inverter = NO_EXT_INV;
   }
+  {SPC}*UseExternalLibrary{EQL}no {
+    mnl->external_library = NO_EXT_LIB;
+    if(myverbose) printf("  UseExternalLibrary set to false in line %d, monomial %d\n", line_of_file, current_monomial);
+  }
+  {SPC}*UseExternalLibrary{EQL}quda {
+    mnl->external_library = QUDA_LIB;
+    if(myverbose) printf("  UseExternalLibrary set to quda in line %d, monomial %d\n", line_of_file, current_monomial);
+  }
   {SPC}*UseSloppyPrecision{EQL}yes {
     if(myverbose) printf("  Use sloppy precision (single) in the inverter (if supported) line %d monomial %d\n", line_of_file, current_monomial);
     mnl->solver_params.sloppy_precision = SLOPPY_SINGLE;

From 5d3caec3e6b6133b48dc240638939a782e941393 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 16 Jan 2024 15:06:47 +0100
Subject: [PATCH 26/34] fix compiler warning

---
 monomial/cloverdet_monomial.c      | 6 ++++++
 monomial/cloverdetratio_monomial.c | 6 ++++++
 monomial/gauge_monomial.h          | 1 +
 quda_interface.c                   | 5 +----
 quda_interface.h                   | 2 +-
 5 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index be11ef230..4fcd6354a 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -24,6 +24,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
+#include <string.h>
 #include "global.h"
 #include "su3.h"
 #include "su3adj.h"
@@ -49,6 +50,11 @@
 #include "operator/clovertm_operators.h"
 #include "operator/clovertm_operators_32.h"
 #include "cloverdet_monomial.h"
+#include "xchange/xchange_deri.h"
+#include "monomial/gauge_monomial.h"
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
 
 /* think about chronological solver ! */
 
diff --git a/monomial/cloverdetratio_monomial.c b/monomial/cloverdetratio_monomial.c
index 5b21c2829..9636df29b 100644
--- a/monomial/cloverdetratio_monomial.c
+++ b/monomial/cloverdetratio_monomial.c
@@ -26,6 +26,7 @@
 #include <stdio.h>
 #include <math.h>
 #include <time.h>
+#include <string.h>
 #include "global.h"
 #include "su3.h"
 #include "start.h"
@@ -45,6 +46,11 @@
 #include "monomial/monomial.h"
 #include "boundary.h"
 #include "cloverdetratio_monomial.h"
+#include "xchange/xchange_deri.h"
+#include "monomial/gauge_monomial.h"
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
 
 /* think about chronological solver ! */
 
diff --git a/monomial/gauge_monomial.h b/monomial/gauge_monomial.h
index 31d267215..1331c9964 100644
--- a/monomial/gauge_monomial.h
+++ b/monomial/gauge_monomial.h
@@ -26,5 +26,6 @@ void gauge_derivative(const int id, hamiltonian_field_t * const hf);
 void gauge_EMderivative(const int id, hamiltonian_field_t * const hf);
 void gauge_heatbath(const int id, hamiltonian_field_t * const hf);
 double gauge_acc(const int id, hamiltonian_field_t * const hf);
+void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native );
 
 #endif
diff --git a/quda_interface.c b/quda_interface.c
index 6b4cd4a7f..351425562 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2514,16 +2514,13 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
 
-void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor ** const X_o, spinor ** const phi, int detratio) {
+void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor * const X_o, spinor * const phi, int detratio) {
   tm_stopwatch_push(&g_timers, __func__, "");
   
   _initQuda();
   _initMomQuda();
   void *spinorIn;
   void *spinorPhi;
-  const int nr_sf_in = 1;
-  spinor ** in_o;
-
   
   spinorIn  = (void*)X_o;
   reorder_spinor_eo_toQuda((double*)spinorIn, inv_param.cpu_prec, 0, 1);
diff --git a/quda_interface.h b/quda_interface.h
index cf9d37faf..ca7393227 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -174,7 +174,7 @@ int invert_eo_quda_twoflavour_mshift(spinor ** const out_up, spinor ** const out
 
 void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf);
 void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf,
-                                     spinor ** const X_o, spinor ** const phi, int ratio);
+                                     spinor * const X_o, spinor * const phi, int ratio);
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
 #endif /* QUDA_INTERFACE_H_ */

From 4b16c413372e0b19a7ec2987b892c47d1dec92d3 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Tue, 16 Jan 2024 21:20:39 +0100
Subject: [PATCH 27/34] move compare_derivative into its own compilation unit
 and make the strictness variable, specify a 'name' argument

---
 Makefile.in                        |  2 +-
 compare_derivative.c               | 67 ++++++++++++++++++++++++++++++
 compare_derivative.h               | 29 +++++++++++++
 monomial/cloverdet_monomial.c      |  4 +-
 monomial/cloverdetratio_monomial.c |  3 +-
 monomial/gauge_monomial.c          | 41 +-----------------
 monomial/gauge_monomial.h          |  1 -
 7 files changed, 103 insertions(+), 44 deletions(-)
 create mode 100644 compare_derivative.c
 create mode 100644 compare_derivative.h

diff --git a/Makefile.in b/Makefile.in
index 8b54c70cd..7aaca2e0d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -58,7 +58,7 @@ MODULES = read_input gamma measure_gauge_action start \
 	little_D block operator \
 	spinor_fft X_psi P_M_eta \
 	jacobi fatal_error invert_clover_eo gettime \
-	tm_debug_printf \
+	tm_debug_printf compare_derivative \
         @SPI_FILES@ @QUDA_INTERFACE@ @DDalphaAMG_INTERFACE@
 
 CXXMODULES = @QPHIX_INTERFACE@
diff --git a/compare_derivative.c b/compare_derivative.c
new file mode 100644
index 000000000..14ce50d61
--- /dev/null
+++ b/compare_derivative.c
@@ -0,0 +1,67 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2024 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP 
+# include <omp.h>
+#endif
+#include <stdio.h>
+#include "global.h"
+#include "monomial/monomial.h"
+
+/* this function compares two derivatives calculated by an external library and tmLQCD */
+void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native, 
+    const double threshold, const char * name){
+  int n_diff = 0;
+
+  for(int ix = 0; ix < VOLUME; ix++){
+    for(int mu=0; mu<4; mu++){
+      double *ext=&(ext_lib[ix][mu].d1);
+      double *nat=&(native[ix][mu].d1);
+      for(int j=0; j<8; ++j){
+        double diff=ext[j]-nat[j];
+        if (sqrt(diff*diff) > threshold || isnan( ext[j] ) || isinf(ext[j]) ){
+            n_diff++;
+            printf("derivative at (t,x,y,z,mu,j) %d,%d,%d,%d,%d,%d,"
+                   " ext: %-14e, native: %-14e ratio: %-14g diff %-14g  on proc_id %d\n", 
+                  g_coord[ix][0], g_coord[ix][1], g_coord[ix][2], g_coord[ix][3], mu, j,
+                  ext[j], nat[j], ext[j]/nat[j], ext[j]-nat[j], g_proc_id);
+        }
+      }
+    }
+  }
+  if(n_diff > 0){
+    printf("%s: the deviation between tmLQCD and the external library "
+           "exceeds the threshold %.1e in %d case(s) for parameters: c0=%e c1=%e g_beta=%e on proc_id: %d\n",
+           name,
+           threshold,
+           n_diff,
+           mnl->c0,
+           mnl->c1,
+           mnl->beta,
+           g_proc_id);
+
+    if(g_strict_residual_check) fatal_error("Difference between external library and tmLQCD-native function!", 
+                                            name);
+  }
+}
+
diff --git a/compare_derivative.h b/compare_derivative.h
new file mode 100644
index 000000000..82fd3b1a7
--- /dev/null
+++ b/compare_derivative.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2024 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef COMPARE_DERIVATIVE_H
+#define COMPARE_DERIVATIVE_H
+
+#include "monomial/monomial.h"
+#include "su3adj.h"
+
+void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native, const double threshold, const char * name);
+
+#endif
diff --git a/monomial/cloverdet_monomial.c b/monomial/cloverdet_monomial.c
index 4fcd6354a..b98a38e3c 100644
--- a/monomial/cloverdet_monomial.c
+++ b/monomial/cloverdet_monomial.c
@@ -51,7 +51,7 @@
 #include "operator/clovertm_operators_32.h"
 #include "cloverdet_monomial.h"
 #include "xchange/xchange_deri.h"
-#include "monomial/gauge_monomial.h"
+#include "compare_derivative.h"
 #ifdef TM_USE_QUDA
 #  include "quda_interface.h"
 #endif
@@ -143,7 +143,7 @@ void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
       #ifdef TM_USE_MPI
         xchange_deri(hf->derivative);// this function use ddummy inside
       #endif
-      compare_derivative(mnl, given, hf->derivative);
+      compare_derivative(mnl, given, hf->derivative, 1e-9, "cloverdet_derivative");
       mnl->external_library = QUDA_LIB;
       hf->derivative = given;
     }
diff --git a/monomial/cloverdetratio_monomial.c b/monomial/cloverdetratio_monomial.c
index 9636df29b..ccaf91d5a 100644
--- a/monomial/cloverdetratio_monomial.c
+++ b/monomial/cloverdetratio_monomial.c
@@ -48,6 +48,7 @@
 #include "cloverdetratio_monomial.h"
 #include "xchange/xchange_deri.h"
 #include "monomial/gauge_monomial.h"
+#include "compare_derivative.h"
 #ifdef TM_USE_QUDA
 #  include "quda_interface.h"
 #endif
@@ -253,7 +254,7 @@ void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) {
       #ifdef TM_USE_MPI
         xchange_deri(hf->derivative);// this function use ddummy inside
       #endif
-      compare_derivative(mnl, given, hf->derivative);
+      compare_derivative(mnl, given, hf->derivative, 1e-9, "cloverdetratio_derivative");
       mnl->external_library = QUDA_LIB;
       hf->derivative = given;
     }
diff --git a/monomial/gauge_monomial.c b/monomial/gauge_monomial.c
index 6b45174ab..59f6455dd 100644
--- a/monomial/gauge_monomial.c
+++ b/monomial/gauge_monomial.c
@@ -44,49 +44,12 @@
 #include "monomial/monomial.h"
 #include "hamiltonian_field.h"
 #include "gauge_monomial.h"
+#include "compare_derivative.h"
 #include "fatal_error.h"
 #ifdef TM_USE_QUDA
 #include "quda_interface.h"
 #endif
 
-/* this function compares the gauge derivative calculated by an external library and tmLQCD */
-void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native ){
-  const double threshold = 1e-7;
-  int n_diff = 0;
-
-  for(int ix = 0; ix < VOLUME; ix++){
-    for(int mu=0; mu<4; mu++){
-      double *ext=&(ext_lib[ix][mu].d1);
-      double *nat=&(native[ix][mu].d1);
-      for(int j=0; j<8; ++j){
-        double diff;
-        if (fabs(nat[j])>1e-7) diff=(ext[j]-nat[j])/nat[j];
-        else diff=ext[j]-nat[j];
-        if (sqrt(diff*diff) > threshold || isnan( ext[j] ) || isinf(ext[j]) ){
-            n_diff++;
-            printf("derivative at (t,x,y,z,mu,j) %d,%d,%d,%d,%d,%d, ext: %-14e, native: %-14e ratio: %-14g diff %-14g  on proc_id %d\n", 
-                  g_coord[ix][0], g_coord[ix][1], g_coord[ix][2], g_coord[ix][3], mu, j,
-                  ext[j], nat[j], ext[j]/nat[j], ext[j]-nat[j], g_proc_id);
-        }
-      }
-    }
-  }
-  if(n_diff > 0){
-    printf("gauge_derivative: the relative deviation between tmLQCD and the external library "
-           "exceeds the threshold %.1e in %d case(s) for parameters: c0=%e c1=%e g_beta=%e on proc_id: %d\n",
-           threshold,
-           n_diff,
-           mnl->c0,
-           mnl->c1,
-           mnl->beta,
-           g_proc_id);
-
-    if(g_strict_residual_check) fatal_error("Difference between external library and tmLQCD-native function!", 
-                                            "gauge_derivative");
-  }
-}
-
-
 /* this function calculates the derivative of the momenta: equation 13 of Gottlieb */
 void gauge_derivative(const int id, hamiltonian_field_t * const hf) {
   monomial * mnl = &monomial_list[id];
@@ -107,7 +70,7 @@ void gauge_derivative(const int id, hamiltonian_field_t * const hf) {
      hf->derivative=ddummy;
      mnl->external_library=NO_EXT_LIB;
      gauge_derivative(id, hf);
-     compare_derivative(mnl,given, ddummy);
+     compare_derivative(mnl, given, ddummy, 1e-9, "gauge_derivative");
      mnl->external_library=QUDA_LIB;
      hf->derivative=given;
     }
diff --git a/monomial/gauge_monomial.h b/monomial/gauge_monomial.h
index 1331c9964..31d267215 100644
--- a/monomial/gauge_monomial.h
+++ b/monomial/gauge_monomial.h
@@ -26,6 +26,5 @@ void gauge_derivative(const int id, hamiltonian_field_t * const hf);
 void gauge_EMderivative(const int id, hamiltonian_field_t * const hf);
 void gauge_heatbath(const int id, hamiltonian_field_t * const hf);
 double gauge_acc(const int id, hamiltonian_field_t * const hf);
-void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native );
 
 #endif

From 28298a0d15e5791465f5aac7eb29526bcb3818d0 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Wed, 17 Jan 2024 18:16:51 +0100
Subject: [PATCH 28/34] In compare_derivative, output a status message even if
 n_diff = 0. Make sure to not miss anything by doing a MPI_MAX reduction on
 rank 0.

---
 compare_derivative.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/compare_derivative.c b/compare_derivative.c
index 14ce50d61..ab30bc0b7 100644
--- a/compare_derivative.c
+++ b/compare_derivative.c
@@ -63,5 +63,17 @@ void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native,
     if(g_strict_residual_check) fatal_error("Difference between external library and tmLQCD-native function!", 
                                             name);
   }
+
+  int red_n_diff = 0;
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Reduce(&n_diff, &red_n_diff, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
+#else
+  red_n_diff = n_diff;
+#endif
+  if(g_proc_id == 0){
+    printf("The maximum number of deviations in %s exceeding the threshold %.1e was %d\n",
+           name, threshold, red_n_diff);
+  }
 }
 

From ac74bfeec6a042f2f0f47a8f7f97dd4210e46e03 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Fri, 19 Jan 2024 14:07:13 +0100
Subject: [PATCH 29/34] documentation of UseExternalLibrary

---
 doc/input.tex | 12 ++++++++++++
 doc/quda.tex  |  4 +++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/doc/input.tex b/doc/input.tex
index 780e1afee..567b78e90 100644
--- a/doc/input.tex
+++ b/doc/input.tex
@@ -432,6 +432,10 @@ \subsection{Input parameter for main program}
 \item {\ttfamily CLOVERDET}:
   \begin{itemize}
   \item {\ttfamily csw}
+  \item {\ttfamily UseExternalInverter} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
+  \item {\ttfamily UseExternalLibrary} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
 \item {\ttfamily DET, CLOVERDET}:
   \begin{itemize}
@@ -468,6 +472,12 @@ \subsection{Input parameter for main program}
 %
 \item {\ttfamily CLOVERDETRATIO}: 
   see {\ttfamily CLOVERDET} and {\ttfamily DETRATIO}.
+  \begin{itemize}
+    \item {\ttfamily UseExternalInverter} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
+  \item {\ttfamily UseExternalLibrary} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
+  \end{itemize}
 %
 \item {\ttfamily GAUGE}: 
   \begin{itemize}
@@ -490,6 +500,8 @@ \subsection{Input parameter for main program}
   \item {\ttfamily RectangleCoefficient}: the value of the parameter
     $c_1$. The coefficient $c_0$ is computed from $c_0 = 1-8c_1$. Is
     effective only for {\ttfamily type = user}.
+   \item {\ttfamily UseExternalLibrary} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
   There is maximally one instance allowed of this type.
   
diff --git a/doc/quda.tex b/doc/quda.tex
index 85fa14095..02599f84c 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -13,7 +13,9 @@ \subsubsection{Design goals of the interface}
 \begin{enumerate}
 	\item \emph{Safety.} Naturally, highest priority is given to the correctness of the output of the interface. 
 	This is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines.
-	\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseQudaInverter} is introduced which, when set to {\ttfamily yes}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} are supported.\footnote{{\ttfamily DBCLOVER} is supported by the interface but not by QUDA as of version 0.7.0.}
+	\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily yes}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} are supported.\footnote{{\ttfamily DBCLOVER} is supported by the interface but not by QUDA as of version 0.7.0.}.\\
+    Within the monomial declarations of the input file (between {\ttfamily BeginMonomial} and {\ttfamily EndMonomial}) the flag {\ttfamily UseExternalLibrary} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the force calculation for the given monomial.
+	The monomials supperted are: {\ttfamily GAUGE, CLOVERDET, CLOVERDETRATIO}.
 	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). In the file {\ttfamily operators.c}, the QUDA library is initialized when an operator is initialized which has set {\ttfamily UseQudaInverter = yes}. There, the actual call to the inverter is conditionally replaced with a call to the QUDA interface.
 	\item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular:
 	\begin{itemize}

From 44449ad7287e6f5b9a49299aafe20f69d6d4a27d Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Fri, 19 Jan 2024 14:49:15 +0100
Subject: [PATCH 30/34] remove duplicate item DET,CLOVERDET

---
 doc/input.tex | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/doc/input.tex b/doc/input.tex
index 567b78e90..74efa332b 100644
--- a/doc/input.tex
+++ b/doc/input.tex
@@ -425,21 +425,10 @@ \subsection{Input parameter for main program}
 Each of them has different options
 :
 \begin{itemize}
-\item {\ttfamily DET, CLOVERDET}:
-  \begin{itemize}
-  \item {\ttfamily 2KappaMu}
-  \end{itemize}
-\item {\ttfamily CLOVERDET}:
-  \begin{itemize}
-  \item {\ttfamily csw}
-  \item {\ttfamily UseExternalInverter} 
-  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
-  \item {\ttfamily UseExternalLibrary} 
-  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
-  \end{itemize}
 \item {\ttfamily DET, CLOVERDET}:
   \begin{itemize}
   \item {\ttfamily Kappa}
+  \item {\ttfamily 2KappaMu}
   \item {\ttfamily Timescale}: the timescale on which to integrate
     this monomial. Counting starts from zero up to the total number of
     timescales minus 1.
@@ -460,6 +449,15 @@ \subsection{Input parameter for main program}
     default is {\ttfamily DET}
   \end{itemize}
 %
+\item {\ttfamily CLOVERDET}:
+\begin{itemize}
+	\item {\ttfamily csw}
+	\item {\ttfamily UseExternalInverter} 
+	Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
+	\item {\ttfamily UseExternalLibrary} 
+	Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
+\end{itemize}
+%
 \item {\ttfamily DETRATIO}: the same as for {\ttfamily DET}, but in
   addition:
   \begin{itemize}

From 09d879763a0ca28afdedfc69fd2db9f4c238a65c Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Fri, 19 Jan 2024 15:05:42 +0100
Subject: [PATCH 31/34] documentation: move UseExternalInverter to DET and
 DETRATIO

---
 doc/input.tex | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/doc/input.tex b/doc/input.tex
index 74efa332b..85610ed36 100644
--- a/doc/input.tex
+++ b/doc/input.tex
@@ -447,13 +447,13 @@ \subsection{Input parameter for main program}
   \item {\ttfamily HB\_Solver}: the solver to be used in the heatbath step, see section \ref{sec:hb.solver} for details.
   \item {\ttfamily Name}: a name to be assigned to the monomial. The
     default is {\ttfamily DET}
+  \item {\ttfamily UseExternalInverter} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
 %
 \item {\ttfamily CLOVERDET}:
 \begin{itemize}
 	\item {\ttfamily csw}
-	\item {\ttfamily UseExternalInverter} 
-	Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
 	\item {\ttfamily UseExternalLibrary} 
 	Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
 \end{itemize}
@@ -466,16 +466,13 @@ \subsection{Input parameter for main program}
   \item {\ttfamily Name}: 
     a name to be assigned to the monomial. 
     The default is {\ttfamily DETRATIO}
+  \item {\ttfamily UseExternalInverter} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
 %
 \item {\ttfamily CLOVERDETRATIO}: 
   see {\ttfamily CLOVERDET} and {\ttfamily DETRATIO}.
-  \begin{itemize}
-    \item {\ttfamily UseExternalInverter} 
-  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
-  \item {\ttfamily UseExternalLibrary} 
-  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
-  \end{itemize}
+ 
 %
 \item {\ttfamily GAUGE}: 
   \begin{itemize}

From fa3b0518f66859c2fff2716a73d628ddfb5cff4c Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Fri, 19 Jan 2024 16:28:56 +0100
Subject: [PATCH 32/34] add check if UseExternalLibrary=quda you must also have
 UseExternalInverter=quda

---
 read_input.l | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/read_input.l b/read_input.l
index 3b39524dc..97d0e36b1 100644
--- a/read_input.l
+++ b/read_input.l
@@ -2340,6 +2340,10 @@ static inline double fltlist_next_token(int * const list_end){
   }
   {SPC}*UseExternalLibrary{EQL}quda {
     mnl->external_library = QUDA_LIB;
+    if(mnl->solver_params.external_inverter != QUDA_INVERTER){
+       printf("Error:  usage of UseExternalLibrary = quda in line %d, monomial %d is not supported without UseExternalInverter = quda\n", line_of_file, current_monomial);
+       exit(1);
+    }
     if(myverbose) printf("  UseExternalLibrary set to quda in line %d, monomial %d\n", line_of_file, current_monomial);
   }
   {SPC}*UseSloppyPrecision{EQL}yes {

From a86fee5d506fd36e0948c42179c77252037d3e4d Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Tue, 23 Jan 2024 16:55:55 +0100
Subject: [PATCH 33/34] fix check of UseExternalLibrary=quda &&
 UseExternalInverter=quda

---
 monomial/monomial.c | 12 ++++++++++++
 read_input.l        |  4 ----
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/monomial/monomial.c b/monomial/monomial.c
index b7e8a55c5..ee3ddea02 100644
--- a/monomial/monomial.c
+++ b/monomial/monomial.c
@@ -248,6 +248,12 @@ int init_monomials(const int V, const int even_odd_flag) {
           monomial_list[i].Qm = &Qsw_full_minus_psi;
         }
         init_swpm(VOLUME);
+        if(monomial_list[i].external_library==QUDA_LIB){
+          if(monomial_list[i].solver_params.external_inverter != QUDA_INVERTER){
+            tm_debug_printf(0,0,"Error:  CLOVERDET monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
+            exit(1);
+          }
+        }
         clover_monomials[no_clover_monomials] = i;
         no_clover_monomials++;
         if(g_proc_id == 0 && g_debug_level > 1) {
@@ -271,6 +277,12 @@ int init_monomials(const int V, const int even_odd_flag) {
         monomial_list[i].Qp = &Qsw_plus_psi;
         monomial_list[i].Qm = &Qsw_minus_psi;
         init_swpm(VOLUME);
+        if(monomial_list[i].external_library==QUDA_LIB){
+          if(monomial_list[i].solver_params.external_inverter != QUDA_INVERTER){
+            tm_debug_printf(0,0,"Error:  CLOVERDETRATIO monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
+            exit(1);
+          }
+        }
         if(g_proc_id == 0 && g_debug_level > 1) {
           printf("# Initialised monomial %s of type CLOVERDETRATIO, no_monomials= %d\n",
               monomial_list[i].name,
diff --git a/read_input.l b/read_input.l
index 97d0e36b1..3b39524dc 100644
--- a/read_input.l
+++ b/read_input.l
@@ -2340,10 +2340,6 @@ static inline double fltlist_next_token(int * const list_end){
   }
   {SPC}*UseExternalLibrary{EQL}quda {
     mnl->external_library = QUDA_LIB;
-    if(mnl->solver_params.external_inverter != QUDA_INVERTER){
-       printf("Error:  usage of UseExternalLibrary = quda in line %d, monomial %d is not supported without UseExternalInverter = quda\n", line_of_file, current_monomial);
-       exit(1);
-    }
     if(myverbose) printf("  UseExternalLibrary set to quda in line %d, monomial %d\n", line_of_file, current_monomial);
   }
   {SPC}*UseSloppyPrecision{EQL}yes {

From 6d2f3fe30fd10c88249cccc92fe5b861ed9b2ae5 Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Thu, 25 Jan 2024 15:56:16 +0100
Subject: [PATCH 34/34] add configure flag
 --enable-quda_fermionic_forces=yes/no for old versions of QUDA

---
 configure.in                        | 10 ++++++++++
 doc/quda.tex                        | 16 ++++++++++++++++
 include/tmlqcd_config_internal.h.in |  3 +++
 monomial/monomial.c                 |  4 ++--
 quda_interface.c                    |  8 +++++++-
 5 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/configure.in b/configure.in
index da1ea9046..5aaf15dfd 100644
--- a/configure.in
+++ b/configure.in
@@ -964,6 +964,16 @@ if test $enable_quda_experimental = yes; then
 else
   AC_MSG_RESULT(no)
 fi
+AC_MSG_CHECKING(whether the QUDA force is enabled)
+AC_ARG_ENABLE(quda_fermionic_forces,
+  AS_HELP_STRING([--enable-quda_fermionic_forces], [enable support for fermionic forces using QUDA [default=yes]]),
+  enable_quda_fermionic_forces=$enableval, enable_quda_fermionic_forces=yes)
+if test $enable_quda_fermionic_forces = no; then
+  AC_MSG_RESULT(no)
+else
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(TM_QUDA_FERMIONIC_FORCES,1, fermionic forces with QUDA are enabled)
+fi
 
 # QPhiX library for Intel Xeon and Xeon Phis
 AC_MSG_CHECKING(whether we want to use QPhiX)
diff --git a/doc/quda.tex b/doc/quda.tex
index 9680c6210..281be3d1e 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -70,6 +70,22 @@ \subsubsection{Installation}
 \end{verbatim}
 Note that a {\ttfamily C++} compiler is required for linking against the QUDA library, therefore set {\ttfamily CXX} appropriately. {\ttfamily \${QUDADIR}} is where you installed QUDA in the previous step and {\ttfamily \${CUDADIR}} is required again for linking.
 
+\subsubsection{QUDA versions}
+
+If you need a version of QUDA after https://github.com/lattice/quda/commit/50864ffde1bd8f46fd4a2a2b2e6d44a5a588e2c2 you nee to configure with 
+\begin{verbatim}
+  --enable-quda_experimental=yes
+\end{verbatim}
+
+If you need a version of QUDA before \url{https://github.com/lattice/quda/commit/fd50676db06fc36efb3a791a3059c57cca70bb55} you need to add in the configuration script the option
+\begin{verbatim}
+  --enable-quda_fermionic_forces=no
+\end{verbatim}
+so that the wrapper to the QUDA fermionic forces is not compiled,
+thus if \texttt{--enable-quda_fermionic_forces=no} setting {\ttfamily UseExternalLibrary=yes} in the inputfile for the {\ttfamily  CLOVERDET, CLOVERDETRATIO} monomials
+is not supported and tmLQCD will stop with an error.
+
+
 \subsubsection{Usage}
 Any main program that reads and handles the operator declaration from an input file can easily be set up to use the QUDA inverter by setting the {\ttfamily UseExternalInverter} flag to {\ttfamily quda}. For example, in the input file for the {\ttfamily invert} executable, add the flag to the operator declaration as
 \begin{verbatim}
diff --git a/include/tmlqcd_config_internal.h.in b/include/tmlqcd_config_internal.h.in
index a90b18c55..1a187466a 100644
--- a/include/tmlqcd_config_internal.h.in
+++ b/include/tmlqcd_config_internal.h.in
@@ -203,6 +203,9 @@
 /* Using experimental QUDA version */
 #undef TM_QUDA_EXPERIMENTAL
 
+/* Using QUDA fermionic forces */
+#undef TM_QUDA_FERMIONIC_FORCES
+
 /* Using DDalphaAMG */
 #undef DDalphaAMG
 
diff --git a/monomial/monomial.c b/monomial/monomial.c
index ee3ddea02..2227e3897 100644
--- a/monomial/monomial.c
+++ b/monomial/monomial.c
@@ -250,7 +250,7 @@ int init_monomials(const int V, const int even_odd_flag) {
         init_swpm(VOLUME);
         if(monomial_list[i].external_library==QUDA_LIB){
           if(monomial_list[i].solver_params.external_inverter != QUDA_INVERTER){
-            tm_debug_printf(0,0,"Error:  CLOVERDET monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
+            tm_debug_printf(0,0,"Error: CLOVERDET monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
             exit(1);
           }
         }
@@ -279,7 +279,7 @@ int init_monomials(const int V, const int even_odd_flag) {
         init_swpm(VOLUME);
         if(monomial_list[i].external_library==QUDA_LIB){
           if(monomial_list[i].solver_params.external_inverter != QUDA_INVERTER){
-            tm_debug_printf(0,0,"Error:  CLOVERDETRATIO monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
+            tm_debug_printf(0,0,"Error: CLOVERDETRATIO monomial of UseExternalLibrary = quda is not supported without UseExternalInverter = quda\n");
             exit(1);
           }
         }
diff --git a/quda_interface.c b/quda_interface.c
index 32d13a327..9789c1b7a 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2541,6 +2541,7 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
 
+#ifdef TM_QUDA_FERMIONIC_FORCES
 void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor * const X_o, spinor * const phi, int detratio) {
   tm_stopwatch_push(&g_timers, __func__, "");
   
@@ -2582,7 +2583,12 @@ void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t
   }
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
-
+#else 
+void compute_cloverdet_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf, spinor * const X_o, spinor * const phi, int detratio) {
+  tm_debug_printf(0,0,"Error:   UseExternalLibrary = quda requires that tmLQCD is compiled with --enable-quda_fermionic=yes\n");
+  exit(1);
+}
+#endif
 
 
 void  compute_WFlow_quda(const double eps, const double tmax, const int traj, FILE* outfile){