From 489ee3d3a4f55d1d6ab57ccdfaa13bb318a1fae8 Mon Sep 17 00:00:00 2001
From: Aniket Sen <sen@qbig.cluster.hiskp>
Date: Mon, 13 Mar 2023 15:58:32 +0100
Subject: [PATCH 01/49] initial commit, interface works with quda

---
 monomial/ndrat_monomial.c |  17 ++++--
 quda_interface.c          | 106 ++++++++++++++++++++++++++++++++++++++
 quda_interface.h          |  16 ++++++
 solver/eigenvalues_bi.c   |  37 ++++++++++++-
 4 files changed, 169 insertions(+), 7 deletions(-)

diff --git a/monomial/ndrat_monomial.c b/monomial/ndrat_monomial.c
index 1dd669b33..fd4018027 100644
--- a/monomial/ndrat_monomial.c
+++ b/monomial/ndrat_monomial.c
@@ -213,11 +213,7 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
     sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
     copy_32_sw_fields();
   }
-  // we measure before the trajectory!
-  if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
-    if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);
-    else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi);
-  }
+  
 
   // the Gaussian distributed random fields
   tm_stopwatch_push(&g_timers, "random_energy0", "");
@@ -243,6 +239,17 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
   }
   mnl->solver_params.sdim = VOLUME/2;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
+  initQudaforEig(mnl->solver_params.squared_solver_prec, mnl->solver_params.max_iter,
+                                                       mnl->solver_params.type, mnl->solver_params.rel_prec,
+                                                       1, // we only support even-odd here
+                                                       mnl->solver_params.refinement_precision,
+                                                       mnl->solver_params.sloppy_precision,
+                                                       mnl->solver_params.compression_type);
+  // we measure before the trajectory!
+  if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
+    if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);
+    else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi);
+  }
   mnl->iter0 = solve_mms_nd_plus(g_chi_up_spinor_field, g_chi_dn_spinor_field,
                                  mnl->pf, mnl->pf2, &(mnl->solver_params) );
 
diff --git a/quda_interface.c b/quda_interface.c
index 36c7974ed..d454325cd 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -148,6 +148,10 @@ QudaEigParam mg_eig_param[QUDA_MAX_MG_LEVEL];
 // input params specific to tmLQCD QUDA interface
 tm_QudaParams_t quda_input;
 
+
+// parameters for the eigensolver
+QudaEigParam eig_param;
+
 // pointer to the QUDA gaugefield
 double *gauge_quda[4];
 
@@ -2323,6 +2327,47 @@ int invert_eo_quda_oneflavour_mshift(spinor ** const out,
   return(iterations);
 }
 
+void initQudaforEig(const double precision, const int max_iter, 
+                    const int solver_flag, const int rel_prec,
+                    const int even_odd_flag, const SloppyPrecision refinement_precision, 
+                    SloppyPrecision sloppy_precision, CompressionType compression) {
+  
+
+  // it returns if quda is already init
+  _initQuda();
+
+  if ( rel_prec )
+    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
+  else
+    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
+
+  inv_param.kappa = g_kappa;
+
+  // figure out which BC to use (theta, trivial...)
+  set_boundary_conditions(&compression, &gauge_param);
+  // set the sloppy precision of the mixed prec solver
+  set_sloppy_prec(sloppy_precision, refinement_precision, &gauge_param, &inv_param);
+  
+  // load gauge after setting precision
+  _loadGaugeQuda(compression);
+
+  _setTwoFlavourSolverParam(g_kappa,
+                            g_c_sw,
+                            g_mubar,
+                            g_epsbar,
+                            solver_flag,
+                            even_odd_flag,
+                            precision,
+                            max_iter,
+                            1 /*single_parity_solve */,
+                            1 /*always QpQm*/);
+
+  // QUDA applies the MMdag operator, we need QpQm^{-1) in the end
+  // so we want QUDA to use the MdagM operator
+  inv_param.dagger = QUDA_DAG_YES;
+
+}
+
 int invert_eo_quda_twoflavour_mshift(spinor ** const out_up, spinor ** const out_dn,
                                      spinor * const in_up, spinor * const in_dn,
                                      const double precision, const int max_iter,
@@ -2573,3 +2618,64 @@ void  compute_WFlow_quda(const double eps, const double tmax, const int traj, FI
   free(obs_param);
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 }
+
+
+
+
+/********************************************************
+
+Interface function for Eigensolver on Quda
+
+*********************************************************/
+
+
+void eigsolveQuda(int n, int lda, double tau, double tol, 
+        int kmax, int jmax, int jmin, int itmax,
+        int blksize, int blkwise, 
+        int V0dim, _Complex double *V0, 
+        int solver_flag, 
+        int linitmax, double eps_tr, double toldecay,
+        int verbosity,
+        int *k_conv, _Complex double ** host_evecs, _Complex double *host_evals, int *it,
+        int maxmin, const int shift_mode) {
+
+  eig_param = newQudaEigParam();
+
+  eig_param.invert_param = &inv_param;
+  eig_param.tol = tol;
+  eig_param.qr_tol = tol;
+  //eig_param.invert_param->verbosity = QUDA_DEBUG_VERBOSE;
+  if(blkwise == 1) {
+    eig_param.eig_type = QUDA_EIG_BLK_IR_ARNOLDI;
+    eig_param.block_size = blksize;
+  }else {
+    eig_param.eig_type = QUDA_EIG_IR_ARNOLDI;
+    eig_param.block_size = 1;
+  }
+  eig_param.use_poly_acc = QUDA_BOOLEAN_FALSE;
+  eig_param.preserve_deflation = QUDA_BOOLEAN_FALSE;
+  eig_param.use_dagger = QUDA_BOOLEAN_TRUE;
+  eig_param.use_norm_op = QUDA_BOOLEAN_TRUE;
+  eig_param.use_pc = QUDA_BOOLEAN_FALSE;
+  eig_param.use_eigen_qr = QUDA_BOOLEAN_FALSE;
+  eig_param.compute_svd = QUDA_BOOLEAN_FALSE;
+  eig_param.compute_gamma5 = QUDA_BOOLEAN_FALSE;
+  if(maxmin == 1) eig_param.spectrum = QUDA_SPECTRUM_LM_EIG;
+  else eig_param.spectrum = QUDA_SPECTRUM_SM_EIG;
+
+  //eig_param.save_prec = inv_param.cuda_prec_eigensolver;
+  eig_param.invert_param->cuda_prec_eigensolver = inv_param.cuda_prec;
+  eig_param.invert_param->clover_cuda_prec_eigensolver = inv_param.cuda_prec;
+
+  strncpy(eig_param.vec_outfile,"",256);
+  
+
+  eig_param.n_conv = 1;
+  eig_param.n_ev = 1;
+  eig_param.n_kr = 96;
+
+  eig_param.max_restarts = linitmax;
+
+  eigensolveQuda((void **)host_evecs, host_evals, &eig_param);
+
+}
diff --git a/quda_interface.h b/quda_interface.h
index d555544d0..d96f081b4 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -174,4 +174,20 @@ int invert_eo_quda_twoflavour_mshift(spinor ** const out_up, spinor ** const out
 void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * const hf);
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
+
+void eigsolveQuda(int n, int lda, double tau, double tol, 
+        int kmax, int jmax, int jmin, int itmax,
+        int blksize, int blkwise, 
+        int V0dim, _Complex double *V0, 
+        int solver_flag, 
+        int linitmax, double eps_tr, double toldecay,
+        int verbosity,
+        int *k_conv, _Complex double ** host_evecs, _Complex double *host_evals, int *it,
+        int maxmin, const int shift_mode);
+
+void initQudaforEig(const double precision, const int max_iter, 
+                    const int solver_flag, const int rel_prec,
+                    const int even_odd_flag, const SloppyPrecision refinement_precision, 
+                    SloppyPrecision sloppy_precision, CompressionType compression);
+
 #endif /* QUDA_INTERFACE_H_ */
diff --git a/solver/eigenvalues_bi.c b/solver/eigenvalues_bi.c
index 63d78e483..e610f48d2 100644
--- a/solver/eigenvalues_bi.c
+++ b/solver/eigenvalues_bi.c
@@ -54,6 +54,10 @@
 #include "eigenvalues_bi.h"
 #include "operator/tm_operators_nd.h"
 
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
+
 
 double eigenvalues_bi(int * nr_of_eigenvalues,  
 		      const int max_iterations, const double precision,
@@ -64,6 +68,8 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
   static int allocated = 0;
   static bispinor  *eigenvectors_bi = NULL;
   static double * eigenvls_bi = NULL;
+  static _Complex double * eigenvls_quda = NULL;
+  static _Complex double ** eigenvectors_quda = NULL;
 
   /**********************
    * For Jacobi-Davidson 
@@ -126,6 +132,10 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
     eigenvls_bi = (double*)malloc((*nr_of_eigenvalues)*sizeof(double));
   }
 
+  eigenvls_quda = (_Complex double *)malloc((*nr_of_eigenvalues)*sizeof(_Complex double));
+  eigenvectors_bi_= calloc((VOLUME)/2*(*nr_of_eigenvalues), sizeof(bispinor));
+  eigenvectors_quda = calloc((VOLUME)/2*(*nr_of_eigenvalues), sizeof(bispinor));
+
   /* compute eigenvalues */
 
   if((g_proc_id==0) && (g_debug_level > 4)) {
@@ -135,7 +145,7 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
   /* here n and lda are equal, because Q_Qdagger_ND_BI does an internal */
   /* conversion to non _bi fields which are subject to xchange_fields   */
   /* so _bi fields do not need boundary                                 */
-  jdher_bi((VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double), (VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double),
+  /*jdher_bi((VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double), (VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double),
 	   startvalue, prec, 
 	   (*nr_of_eigenvalues), j_max, j_min, 
 	   max_iterations, blocksize, blockwise, v0dim, (_Complex double*) eigenvectors_bi,
@@ -144,9 +154,32 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
 	   &converged, (_Complex double*) eigenvectors_bi, eigenvls_bi,
 	   &returncode, maxmin, 1,
 	   Qsq);
+
+  if(g_proc_id == g_stdio_proc) {
+    printf("\n*****************************\nThis is for testing\n\n");
+    printf("Eigenvalue from tmLQCD = %e\n\n",eigenvls_bi[0]);
+  }*/
+
+  if(g_proc_id == g_stdio_proc) {
+    printf("Using QUDA now.\n");
+  }
+
+  eigsolveQuda((VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double), (VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double),
+     startvalue, prec, 
+     (*nr_of_eigenvalues), j_max, j_min, 
+     max_iterations, blocksize, blockwise, v0dim, (_Complex double*) eigenvectors_bi,
+     BICGSTAB, solver_it_max,
+     threshold, decay, verbosity,
+     &converged, (_Complex double**) eigenvectors_quda, eigenvls_quda,
+     &returncode, maxmin, 1);
+
+  if(g_proc_id == g_stdio_proc) {
+    printf("Eigenvalue from Quda = %e\n\n",eigenvls_quda[0]);
+  }
+
   
   *nr_of_eigenvalues = converged;
 
-  returnvalue = eigenvls_bi[0];
+  returnvalue = eigenvls_quda[0];
   return(returnvalue);
 }

From ee79b0f7e4537f162085740d39dc390ca9d6a468 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Fri, 24 Mar 2023 15:33:44 +0100
Subject: [PATCH 02/49] moved initQudaforEig into phmc_compute_ev

---
 monomial/ndrat_monomial.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/monomial/ndrat_monomial.c b/monomial/ndrat_monomial.c
index fd4018027..1e37d9b51 100644
--- a/monomial/ndrat_monomial.c
+++ b/monomial/ndrat_monomial.c
@@ -239,12 +239,7 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
   }
   mnl->solver_params.sdim = VOLUME/2;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
-  initQudaforEig(mnl->solver_params.squared_solver_prec, mnl->solver_params.max_iter,
-                                                       mnl->solver_params.type, mnl->solver_params.rel_prec,
-                                                       1, // we only support even-odd here
-                                                       mnl->solver_params.refinement_precision,
-                                                       mnl->solver_params.sloppy_precision,
-                                                       mnl->solver_params.compression_type);
+  
   // we measure before the trajectory!
   if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
     if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);

From 7a48974f53d9c75c8b0f25e1b45201b90f3f49fc Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Fri, 24 Mar 2023 15:36:15 +0100
Subject: [PATCH 03/49] initialize QUDA and eigenvals from QUDA are scaled

---
 phmc.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/phmc.c b/phmc.c
index 009457c4c..71e19bf81 100644
--- a/phmc.c
+++ b/phmc.c
@@ -40,6 +40,10 @@
 #include "solver/matrix_mult_typedef_bi.h"
 #include "gettime.h"
 
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
+
 //                                          --> in  monomial
 double phmc_Cpol;                        // --> MDPolyLocNormConst
 double phmc_cheb_evmin, phmc_cheb_evmax; // --> EVMin, EVMax
@@ -222,12 +226,31 @@ void phmc_compute_ev(const int trajectory_counter,
     printf("# Computing eigenvalues for heavy doublet\n");
   }
 
-  no_eigenvalues = 1;
+#ifdef TM_USE_QUDA
+  /* Here we initialize QUDA */
+  initQudaforEig(mnl->solver_params.squared_solver_prec, mnl->solver_params.max_iter,
+                 mnl->solver_params.type, mnl->solver_params.rel_prec, 1, // we only support even-odd here
+                 mnl->solver_params.refinement_precision, mnl->solver_params.sloppy_precision, mnl->solver_params.compression_type);
 
+#endif
+
+  no_eigenvalues = 1;
   temp = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 0, Qsq);
+
+#ifdef TM_USE_QUDA
+  if(mnl->EVMax == 1.) {
+    temp = temp / mnl->StildeMax;
+  }
+#endif
   
   no_eigenvalues = 1;
   temp2 = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 1, Qsq);
+
+#ifdef TM_USE_QUDA
+  if(mnl->EVMax == 1.) {
+    temp2 = temp2 / mnl->StildeMax;
+  }
+#endif
   
   if((g_proc_id == 0) && (g_debug_level > 1)) {
     printf("# %s: lowest eigenvalue end of trajectory %d = %e\n", 

From e030595355d11ef81decfdc02356c811567c2250 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Fri, 24 Mar 2023 15:37:27 +0100
Subject: [PATCH 04/49] call eigensolver on QUDA if TM_USE_QUDA is defined

---
 solver/eigenvalues_bi.c | 41 ++++++++++++++++-------------------------
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/solver/eigenvalues_bi.c b/solver/eigenvalues_bi.c
index e610f48d2..aba826c3c 100644
--- a/solver/eigenvalues_bi.c
+++ b/solver/eigenvalues_bi.c
@@ -69,7 +69,6 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
   static bispinor  *eigenvectors_bi = NULL;
   static double * eigenvls_bi = NULL;
   static _Complex double * eigenvls_quda = NULL;
-  static _Complex double ** eigenvectors_quda = NULL;
 
   /**********************
    * For Jacobi-Davidson 
@@ -134,7 +133,6 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
 
   eigenvls_quda = (_Complex double *)malloc((*nr_of_eigenvalues)*sizeof(_Complex double));
   eigenvectors_bi_= calloc((VOLUME)/2*(*nr_of_eigenvalues), sizeof(bispinor));
-  eigenvectors_quda = calloc((VOLUME)/2*(*nr_of_eigenvalues), sizeof(bispinor));
 
   /* compute eigenvalues */
 
@@ -142,10 +140,24 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
     printf(" Values of   mu = %e     mubar = %e     eps = %e     precision = %e  \n \n", g_mu, g_mubar, g_epsbar, precision);
   }
 
+  /* For now, using the TM_USE_QUDA flag
+   * Ideally, one would use an operator flag
+   * like useExternalEigSolver. */
+#ifdef TM_USE_QUDA
+
+  if(g_proc_id == g_stdio_proc) {
+    printf("Using external eigensolver on QUDA.\n");
+  }
+
+  eigsolveQuda((*nr_of_eigenvalues), eigenvls_quda, prec, 
+                blocksize, blockwise, max_iterations, maxmin);
+
+#else
+
   /* here n and lda are equal, because Q_Qdagger_ND_BI does an internal */
   /* conversion to non _bi fields which are subject to xchange_fields   */
   /* so _bi fields do not need boundary                                 */
-  /*jdher_bi((VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double), (VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double),
+  jdher_bi((VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double), (VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double),
 	   startvalue, prec, 
 	   (*nr_of_eigenvalues), j_max, j_min, 
 	   max_iterations, blocksize, blockwise, v0dim, (_Complex double*) eigenvectors_bi,
@@ -155,28 +167,7 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
 	   &returncode, maxmin, 1,
 	   Qsq);
 
-  if(g_proc_id == g_stdio_proc) {
-    printf("\n*****************************\nThis is for testing\n\n");
-    printf("Eigenvalue from tmLQCD = %e\n\n",eigenvls_bi[0]);
-  }*/
-
-  if(g_proc_id == g_stdio_proc) {
-    printf("Using QUDA now.\n");
-  }
-
-  eigsolveQuda((VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double), (VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double),
-     startvalue, prec, 
-     (*nr_of_eigenvalues), j_max, j_min, 
-     max_iterations, blocksize, blockwise, v0dim, (_Complex double*) eigenvectors_bi,
-     BICGSTAB, solver_it_max,
-     threshold, decay, verbosity,
-     &converged, (_Complex double**) eigenvectors_quda, eigenvls_quda,
-     &returncode, maxmin, 1);
-
-  if(g_proc_id == g_stdio_proc) {
-    printf("Eigenvalue from Quda = %e\n\n",eigenvls_quda[0]);
-  }
-
+#endif
   
   *nr_of_eigenvalues = converged;
 

From 14c4e9f101288be159872a95fddc4d662db9194e Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Fri, 24 Mar 2023 15:38:02 +0100
Subject: [PATCH 05/49] clean up and finalize eigparams for QUDA

---
 quda_interface.c | 74 ++++++++++++++++++++++++++++++++----------------
 quda_interface.h | 12 ++------
 2 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index d454325cd..b037f92cf 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2629,53 +2629,79 @@ Interface function for Eigensolver on Quda
 *********************************************************/
 
 
-void eigsolveQuda(int n, int lda, double tau, double tol, 
-        int kmax, int jmax, int jmin, int itmax,
-        int blksize, int blkwise, 
-        int V0dim, _Complex double *V0, 
-        int solver_flag, 
-        int linitmax, double eps_tr, double toldecay,
-        int verbosity,
-        int *k_conv, _Complex double ** host_evecs, _Complex double *host_evals, int *it,
-        int maxmin, const int shift_mode) {
+void eigsolveQuda(int n, _Complex double *host_evals, double tol, 
+                  int blksize, int blkwise,
+                  int max_iterations,  int maxmin) {
 
   eig_param = newQudaEigParam();
 
   eig_param.invert_param = &inv_param;
   eig_param.tol = tol;
   eig_param.qr_tol = tol;
-  //eig_param.invert_param->verbosity = QUDA_DEBUG_VERBOSE;
+
+    
   if(blkwise == 1) {
-    eig_param.eig_type = QUDA_EIG_BLK_IR_ARNOLDI;
+    eig_param.eig_type = QUDA_EIG_BLK_TR_LANCZOS;
     eig_param.block_size = blksize;
   }else {
-    eig_param.eig_type = QUDA_EIG_IR_ARNOLDI;
+    eig_param.eig_type = QUDA_EIG_TR_LANCZOS;
     eig_param.block_size = 1;
   }
+
+  if(eig_param.invert_param->solve_type == QUDA_NORMOP_PC_SOLVE) {
+    eig_param.use_pc = QUDA_BOOLEAN_TRUE;
+    eig_param.use_norm_op = QUDA_BOOLEAN_TRUE;
+  }else if(eig_param.invert_param->solve_type == QUDA_DIRECT_PC_SOLVE) {
+    eig_param.use_pc = QUDA_BOOLEAN_TRUE;
+    eig_param.use_norm_op = QUDA_BOOLEAN_FALSE;
+  }else if(eig_param.invert_param->solve_type == QUDA_NORMOP_SOLVE) {
+    eig_param.use_pc = QUDA_BOOLEAN_FALSE;
+    eig_param.use_norm_op = QUDA_BOOLEAN_TRUE;
+  }else {
+    eig_param.use_pc = QUDA_BOOLEAN_FALSE;
+    eig_param.use_norm_op = QUDA_BOOLEAN_FALSE;
+  }
+
+  /* Not using polynomial acceleration for now.
+   * Might be useful to add the support. */
   eig_param.use_poly_acc = QUDA_BOOLEAN_FALSE;
-  eig_param.preserve_deflation = QUDA_BOOLEAN_FALSE;
-  eig_param.use_dagger = QUDA_BOOLEAN_TRUE;
-  eig_param.use_norm_op = QUDA_BOOLEAN_TRUE;
-  eig_param.use_pc = QUDA_BOOLEAN_FALSE;
-  eig_param.use_eigen_qr = QUDA_BOOLEAN_FALSE;
+  
+  /* Daggers the operator. Not necessary for 
+   * most cases. */
+  eig_param.use_dagger = QUDA_BOOLEAN_FALSE;
+  
+  /* Most likely not necessary. Set TRUE to use 
+   * Eigen routines to eigensolve the upper Hessenberg via QR */
+  eig_param.use_eigen_qr = QUDA_BOOLEAN_FALSE;    
+
   eig_param.compute_svd = QUDA_BOOLEAN_FALSE;
+
+  /* Set TRUE to performs the \gamma_5 OP solve by 
+   * post multipling the eignvectors with \gamma_5 
+   * before computing the eigenvalues */
   eig_param.compute_gamma5 = QUDA_BOOLEAN_FALSE;
-  if(maxmin == 1) eig_param.spectrum = QUDA_SPECTRUM_LM_EIG;
-  else eig_param.spectrum = QUDA_SPECTRUM_SM_EIG;
 
-  //eig_param.save_prec = inv_param.cuda_prec_eigensolver;
+
+  if(maxmin == 1) eig_param.spectrum = QUDA_SPECTRUM_LR_EIG;
+  else eig_param.spectrum = QUDA_SPECTRUM_SR_EIG;
+
+  /* The following two are set to cuda_prec, otherwise 
+   * it gives an error. Such high precision might not be
+   * necessary. But have not found a way to consistently set
+   * the different precisions. */
   eig_param.invert_param->cuda_prec_eigensolver = inv_param.cuda_prec;
   eig_param.invert_param->clover_cuda_prec_eigensolver = inv_param.cuda_prec;
 
   strncpy(eig_param.vec_outfile,"",256);
+  strncpy(eig_param.vec_infile,"",256);
   
 
-  eig_param.n_conv = 1;
-  eig_param.n_ev = 1;
+  eig_param.n_conv = n;
+  eig_param.n_ev = n;
   eig_param.n_kr = 96;
 
-  eig_param.max_restarts = linitmax;
+  eig_param.max_restarts = max_iterations;
 
-  eigensolveQuda((void **)host_evecs, host_evals, &eig_param);
+  eigensolveQuda(NULL, host_evals, &eig_param);
 
 }
diff --git a/quda_interface.h b/quda_interface.h
index d96f081b4..0babe4a27 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -175,15 +175,9 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
 
-void eigsolveQuda(int n, int lda, double tau, double tol, 
-        int kmax, int jmax, int jmin, int itmax,
-        int blksize, int blkwise, 
-        int V0dim, _Complex double *V0, 
-        int solver_flag, 
-        int linitmax, double eps_tr, double toldecay,
-        int verbosity,
-        int *k_conv, _Complex double ** host_evecs, _Complex double *host_evals, int *it,
-        int maxmin, const int shift_mode);
+void eigsolveQuda(int n, _Complex double *host_evals, double tol, 
+                  int blksize, int blkwise,
+                  int max_iterations,  int maxmin);
 
 void initQudaforEig(const double precision, const int max_iter, 
                     const int solver_flag, const int rel_prec,

From cc247bdf19bf1a0e8285dd4dc1cbe28d73e6fdae Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Fri, 24 Mar 2023 16:09:43 +0100
Subject: [PATCH 06/49] fix bug with the array storing the eigenvalues

---
 solver/eigenvalues_bi.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/solver/eigenvalues_bi.c b/solver/eigenvalues_bi.c
index aba826c3c..3ab67df53 100644
--- a/solver/eigenvalues_bi.c
+++ b/solver/eigenvalues_bi.c
@@ -68,7 +68,6 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
   static int allocated = 0;
   static bispinor  *eigenvectors_bi = NULL;
   static double * eigenvls_bi = NULL;
-  static _Complex double * eigenvls_quda = NULL;
 
   /**********************
    * For Jacobi-Davidson 
@@ -128,12 +127,13 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
     eigenvectors_bi_= calloc((VOLUME)/2*(*nr_of_eigenvalues), sizeof(bispinor));
     eigenvectors_bi = eigenvectors_bi_;
 #endif
+#ifdef TM_USE_QUDA
+    eigenvls_bi = (_Complex double *)malloc((*nr_of_eigenvalues)*sizeof(_Complex double));
+#else
     eigenvls_bi = (double*)malloc((*nr_of_eigenvalues)*sizeof(double));
+#endif
   }
 
-  eigenvls_quda = (_Complex double *)malloc((*nr_of_eigenvalues)*sizeof(_Complex double));
-  eigenvectors_bi_= calloc((VOLUME)/2*(*nr_of_eigenvalues), sizeof(bispinor));
-
   /* compute eigenvalues */
 
   if((g_proc_id==0) && (g_debug_level > 4)) {
@@ -149,7 +149,7 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
     printf("Using external eigensolver on QUDA.\n");
   }
 
-  eigsolveQuda((*nr_of_eigenvalues), eigenvls_quda, prec, 
+  eigsolveQuda((*nr_of_eigenvalues), eigenvls_bi, prec, 
                 blocksize, blockwise, max_iterations, maxmin);
 
 #else
@@ -171,6 +171,6 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
   
   *nr_of_eigenvalues = converged;
 
-  returnvalue = eigenvls_quda[0];
+  returnvalue = eigenvls_bi[0];
   return(returnvalue);
 }

From f4cd2781de0d1794cf381f5b3c49b258032fe578 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 19 Apr 2023 11:59:13 +0200
Subject: [PATCH 07/49] revert changes in ndrat_monomial, eigenvalues_bi

---
 monomial/ndrat_monomial.c | 12 +++++-------
 solver/eigenvalues_bi.c   | 24 ------------------------
 2 files changed, 5 insertions(+), 31 deletions(-)

diff --git a/monomial/ndrat_monomial.c b/monomial/ndrat_monomial.c
index 1e37d9b51..1dd669b33 100644
--- a/monomial/ndrat_monomial.c
+++ b/monomial/ndrat_monomial.c
@@ -213,7 +213,11 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
     sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
     copy_32_sw_fields();
   }
-  
+  // we measure before the trajectory!
+  if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
+    if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);
+    else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi);
+  }
 
   // the Gaussian distributed random fields
   tm_stopwatch_push(&g_timers, "random_energy0", "");
@@ -239,12 +243,6 @@ void ndrat_heatbath(const int id, hamiltonian_field_t * const hf) {
   }
   mnl->solver_params.sdim = VOLUME/2;
   mnl->solver_params.rel_prec = g_relative_precision_flag;
-  
-  // we measure before the trajectory!
-  if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
-    if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi);
-    else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi);
-  }
   mnl->iter0 = solve_mms_nd_plus(g_chi_up_spinor_field, g_chi_dn_spinor_field,
                                  mnl->pf, mnl->pf2, &(mnl->solver_params) );
 
diff --git a/solver/eigenvalues_bi.c b/solver/eigenvalues_bi.c
index 3ab67df53..63d78e483 100644
--- a/solver/eigenvalues_bi.c
+++ b/solver/eigenvalues_bi.c
@@ -54,10 +54,6 @@
 #include "eigenvalues_bi.h"
 #include "operator/tm_operators_nd.h"
 
-#ifdef TM_USE_QUDA
-#  include "quda_interface.h"
-#endif
-
 
 double eigenvalues_bi(int * nr_of_eigenvalues,  
 		      const int max_iterations, const double precision,
@@ -127,11 +123,7 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
     eigenvectors_bi_= calloc((VOLUME)/2*(*nr_of_eigenvalues), sizeof(bispinor));
     eigenvectors_bi = eigenvectors_bi_;
 #endif
-#ifdef TM_USE_QUDA
-    eigenvls_bi = (_Complex double *)malloc((*nr_of_eigenvalues)*sizeof(_Complex double));
-#else
     eigenvls_bi = (double*)malloc((*nr_of_eigenvalues)*sizeof(double));
-#endif
   }
 
   /* compute eigenvalues */
@@ -140,20 +132,6 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
     printf(" Values of   mu = %e     mubar = %e     eps = %e     precision = %e  \n \n", g_mu, g_mubar, g_epsbar, precision);
   }
 
-  /* For now, using the TM_USE_QUDA flag
-   * Ideally, one would use an operator flag
-   * like useExternalEigSolver. */
-#ifdef TM_USE_QUDA
-
-  if(g_proc_id == g_stdio_proc) {
-    printf("Using external eigensolver on QUDA.\n");
-  }
-
-  eigsolveQuda((*nr_of_eigenvalues), eigenvls_bi, prec, 
-                blocksize, blockwise, max_iterations, maxmin);
-
-#else
-
   /* here n and lda are equal, because Q_Qdagger_ND_BI does an internal */
   /* conversion to non _bi fields which are subject to xchange_fields   */
   /* so _bi fields do not need boundary                                 */
@@ -166,8 +144,6 @@ double eigenvalues_bi(int * nr_of_eigenvalues,
 	   &converged, (_Complex double*) eigenvectors_bi, eigenvls_bi,
 	   &returncode, maxmin, 1,
 	   Qsq);
-
-#endif
   
   *nr_of_eigenvalues = converged;
 

From af6f781fe0887423136d55ef98a9eb108f009284 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 19 Apr 2023 14:12:17 +0200
Subject: [PATCH 08/49] add input option UseExternalEigSolver

---
 default_input_values.h | 2 ++
 misc_types.h           | 6 ++++++
 read_input.l           | 8 ++++++++
 3 files changed, 16 insertions(+)

diff --git a/default_input_values.h b/default_input_values.h
index cf781368b..c71c9ae74 100644
--- a/default_input_values.h
+++ b/default_input_values.h
@@ -198,6 +198,8 @@
 
 #define _default_external_inverter 0
 
+#define _default_external_eigsolver 0
+
 #define _default_external_library 0
 
 #define _default_subprocess_flag 0
diff --git a/misc_types.h b/misc_types.h
index c7643f002..1695e04d8 100644
--- a/misc_types.h
+++ b/misc_types.h
@@ -89,6 +89,12 @@ typedef enum ExternalInverter_s {
   QPHIX_INVERTER
 } ExternalInverter;
 
+/* enumeration type for the external eigensolver */
+typedef enum ExternalEigSolver_s {
+  NO_EXT_EIGSOLVER = 0,
+  QUDA_EIGSOLVER
+} ExternalEigSolver;
+
 /* enumeration type for the external inverter */
 typedef enum ExternalLibrary_s {
   NO_EXT_LIB = 0,
diff --git a/read_input.l b/read_input.l
index b91b07428..dd2d0dddf 100644
--- a/read_input.l
+++ b/read_input.l
@@ -2505,6 +2505,14 @@ static inline double fltlist_next_token(int * const list_end){
     mnl->rec_ev = a;
     if(myverbose!=0) printf("  Frequency for computing EV's set to %d in line %d monomial %d\n", mnl->rec_ev, line_of_file, current_monomial);
   }
+  {SPC}*UseExternalEigSolver{EQL}quda {
+    if(myverbose) printf("  Use Quda eigensolver line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->external_eigsolver = QUDA_EIGSOLVER;
+  }
+  {SPC}*UseExternalEigSolver{EQL}no {
+    if(myverbose) printf("  Do not use external eigensolver line %d monomial %d\n", line_of_file, current_monomial);
+    mnl->external_eigsolver = NO_EXT_EIGSOLVER;
+  }
 }
 <NDPOLYMONOMIAL,CLPOLYMONOMIAL>{
   {SPC}*MaxPtildeDegree{EQL}{DIGIT}+ {

From f50d533135ea0dbdcca2d8a15246c468dc6b09ea Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 19 Apr 2023 14:13:02 +0200
Subject: [PATCH 09/49] add parameter external_eigsolver to monomial

---
 monomial/monomial.c | 1 +
 monomial/monomial.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/monomial/monomial.c b/monomial/monomial.c
index 5c1c6f80f..3304609ce 100644
--- a/monomial/monomial.c
+++ b/monomial/monomial.c
@@ -114,6 +114,7 @@ int add_monomial(const int type) {
   monomial_list[no_monomials].solver_params.external_inverter = _default_external_inverter;
   monomial_list[no_monomials].solver_params.sloppy_precision = _default_operator_sloppy_precision_flag;
   monomial_list[no_monomials].external_library = _default_external_library;
+  monomial_list[no_monomials].external_eigsolver = _default_external_eigsolver;
   monomial_list[no_monomials].solver_params.refinement_precision = _default_operator_sloppy_precision_flag;
   monomial_list[no_monomials].even_odd_flag = _default_even_odd_flag;
   monomial_list[no_monomials].forcefactor = 1.;
diff --git a/monomial/monomial.h b/monomial/monomial.h
index 48f8fcb6d..9beb1ced7 100644
--- a/monomial/monomial.h
+++ b/monomial/monomial.h
@@ -112,6 +112,7 @@ typedef struct {
   double StildeMin, StildeMax;
   double EVMin, EVMax, EVMaxInv;
   ExternalLibrary external_library;
+  ExternalEigSolver external_eigsolver;
   double * MDPolyCoefs, * PtildeCoefs;
   /* rational approximation */
   rational_t rat;

From 40a1602e1a1841f14128689549cc8ff6abadd9c3 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 19 Apr 2023 14:14:11 +0200
Subject: [PATCH 10/49] eigsolveQuda is called based on mnl->external_eigsolver

---
 phmc.c | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/phmc.c b/phmc.c
index 71e19bf81..c2a8b5ca1 100644
--- a/phmc.c
+++ b/phmc.c
@@ -226,31 +226,27 @@ void phmc_compute_ev(const int trajectory_counter,
     printf("# Computing eigenvalues for heavy doublet\n");
   }
 
-#ifdef TM_USE_QUDA
-  /* Here we initialize QUDA */
-  initQudaforEig(mnl->solver_params.squared_solver_prec, mnl->solver_params.max_iter,
-                 mnl->solver_params.type, mnl->solver_params.rel_prec, 1, // we only support even-odd here
-                 mnl->solver_params.refinement_precision, mnl->solver_params.sloppy_precision, mnl->solver_params.compression_type);
-
-#endif
-
   no_eigenvalues = 1;
-  temp = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 0, Qsq);
-
-#ifdef TM_USE_QUDA
-  if(mnl->EVMax == 1.) {
-    temp = temp / mnl->StildeMax;
+  if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
+    temp = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 0);
+    if(mnl->EVMax == 1.) {
+      temp = temp / mnl->StildeMax;
+    }
+  }else {
+    temp = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 0, Qsq);
   }
-#endif
+  
   
   no_eigenvalues = 1;
-  temp2 = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 1, Qsq);
-
-#ifdef TM_USE_QUDA
-  if(mnl->EVMax == 1.) {
-    temp2 = temp2 / mnl->StildeMax;
+  if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
+    temp2 = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 1);
+    if(mnl->EVMax == 1.) {
+      temp2 = temp2 / mnl->StildeMax;
+    }
+  }else {
+    temp2 = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 1, Qsq);
   }
-#endif
+  
   
   if((g_proc_id == 0) && (g_debug_level > 1)) {
     printf("# %s: lowest eigenvalue end of trajectory %d = %e\n", 

From 3922b05c673efc85aaa9cad26a0b4f0489862229 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 19 Apr 2023 14:15:22 +0200
Subject: [PATCH 11/49] removed initQudaforEig, memory allocation for
 eigenvalues done in the interface, added timing and more comments

---
 quda_interface.c | 76 +++++++++++++++++++++---------------------------
 quda_interface.h |  7 +----
 2 files changed, 34 insertions(+), 49 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index b037f92cf..f47bcf648 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2327,47 +2327,6 @@ int invert_eo_quda_oneflavour_mshift(spinor ** const out,
   return(iterations);
 }
 
-void initQudaforEig(const double precision, const int max_iter, 
-                    const int solver_flag, const int rel_prec,
-                    const int even_odd_flag, const SloppyPrecision refinement_precision, 
-                    SloppyPrecision sloppy_precision, CompressionType compression) {
-  
-
-  // it returns if quda is already init
-  _initQuda();
-
-  if ( rel_prec )
-    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
-  else
-    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
-
-  inv_param.kappa = g_kappa;
-
-  // figure out which BC to use (theta, trivial...)
-  set_boundary_conditions(&compression, &gauge_param);
-  // set the sloppy precision of the mixed prec solver
-  set_sloppy_prec(sloppy_precision, refinement_precision, &gauge_param, &inv_param);
-  
-  // load gauge after setting precision
-  _loadGaugeQuda(compression);
-
-  _setTwoFlavourSolverParam(g_kappa,
-                            g_c_sw,
-                            g_mubar,
-                            g_epsbar,
-                            solver_flag,
-                            even_odd_flag,
-                            precision,
-                            max_iter,
-                            1 /*single_parity_solve */,
-                            1 /*always QpQm*/);
-
-  // QUDA applies the MMdag operator, we need QpQm^{-1) in the end
-  // so we want QUDA to use the MdagM operator
-  inv_param.dagger = QUDA_DAG_YES;
-
-}
-
 int invert_eo_quda_twoflavour_mshift(spinor ** const out_up, spinor ** const out_dn,
                                      spinor * const in_up, spinor * const in_dn,
                                      const double precision, const int max_iter,
@@ -2629,10 +2588,25 @@ Interface function for Eigensolver on Quda
 *********************************************************/
 
 
-void eigsolveQuda(int n, _Complex double *host_evals, double tol, 
+double eigsolveQuda(int n, double tol, 
                   int blksize, int blkwise,
                   int max_iterations,  int maxmin) {
+  
+  // check if QUDA is initialized
+  if (!quda_initialized) {
+    fatal_error("QUDA must be initialized.","eigsolveQuda");
+    return -1;
+  }
+
+  tm_stopwatch_push(&g_timers, __func__, "");
+
+  _Complex double * eigenvls;
+  double returnvalue;
 
+  // allocate memory for eigenvalues
+  eigenvls = (_Complex double *)malloc((n)*sizeof(_Complex double));
+
+  // create new eig_param
   eig_param = newQudaEigParam();
 
   eig_param.invert_param = &inv_param;
@@ -2692,16 +2666,32 @@ void eigsolveQuda(int n, _Complex double *host_evals, double tol,
   eig_param.invert_param->cuda_prec_eigensolver = inv_param.cuda_prec;
   eig_param.invert_param->clover_cuda_prec_eigensolver = inv_param.cuda_prec;
 
+  /* At the moment, the eigenvalues and eigenvectors are neither 
+   * written to or read from disk, but if necessary, can be added
+   * as a feature in future, by setting the following filenames */
   strncpy(eig_param.vec_outfile,"",256);
   strncpy(eig_param.vec_infile,"",256);
   
 
+  /* The size of eigenvector search space and
+   * the number of required converged eigenvectors 
+   * is both set to n */
   eig_param.n_conv = n;
   eig_param.n_ev = n;
+  /* The size of the Krylov space is set to 96.
+   * From my understanding, QUDA automatically scales
+   * this search space, however more testing on this 
+   * might be necessary */
   eig_param.n_kr = 96;
 
   eig_param.max_restarts = max_iterations;
 
-  eigensolveQuda(NULL, host_evals, &eig_param);
+  eigensolveQuda(NULL, eigenvls, &eig_param);
+
+  returnvalue = eigenvls[0];
+
+  tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
+
+  return(returnvalue);
 
 }
diff --git a/quda_interface.h b/quda_interface.h
index 0babe4a27..c163c489f 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -175,13 +175,8 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
 
-void eigsolveQuda(int n, _Complex double *host_evals, double tol, 
+double eigsolveQuda(int n, double tol, 
                   int blksize, int blkwise,
                   int max_iterations,  int maxmin);
 
-void initQudaforEig(const double precision, const int max_iter, 
-                    const int solver_flag, const int rel_prec,
-                    const int even_odd_flag, const SloppyPrecision refinement_precision, 
-                    SloppyPrecision sloppy_precision, CompressionType compression);
-
 #endif /* QUDA_INTERFACE_H_ */

From c308510f9f75d6e247bd5d5f35642681a749b23d Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 26 Apr 2023 14:41:51 +0200
Subject: [PATCH 12/49] QUDA and monomial parameters are initialized within the
 interface

---
 phmc.c           | 19 +++++++++++++++----
 quda_interface.c | 37 +++++++++++++++++++++++++++++++------
 quda_interface.h |  7 ++++---
 3 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/phmc.c b/phmc.c
index c2a8b5ca1..df36e8c65 100644
--- a/phmc.c
+++ b/phmc.c
@@ -25,6 +25,7 @@
 #include <math.h>
 #include <string.h>
 #include <time.h>
+#include <float.h>
 
 #include "global.h"
 
@@ -228,8 +229,13 @@ void phmc_compute_ev(const int trajectory_counter,
 
   no_eigenvalues = 1;
   if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
-    temp = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 0);
-    if(mnl->EVMax == 1.) {
+    temp = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 0,
+                        mnl->accprec, mnl->maxiter, mnl->solver, g_relative_precision_flag,
+                                                       1, // we only support even-odd here
+                                                       mnl->solver_params.refinement_precision,
+                                                       mnl->solver_params.sloppy_precision,
+                                                       mnl->solver_params.compression_type);
+    if( fabs(mnl->EVMax - 1) < 2*DBL_EPSILON ) {
       temp = temp / mnl->StildeMax;
     }
   }else {
@@ -239,8 +245,13 @@ void phmc_compute_ev(const int trajectory_counter,
   
   no_eigenvalues = 1;
   if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
-    temp2 = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 1);
-    if(mnl->EVMax == 1.) {
+    temp2 = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 1,
+                         mnl->accprec, mnl->maxiter, mnl->solver, g_relative_precision_flag,
+                                                       1, // we only support even-odd here
+                                                       mnl->solver_params.refinement_precision,
+                                                       mnl->solver_params.sloppy_precision,
+                                                       mnl->solver_params.compression_type);
+    if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
       temp2 = temp2 / mnl->StildeMax;
     }
   }else {
diff --git a/quda_interface.c b/quda_interface.c
index f47bcf648..7a23776f0 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2588,17 +2588,42 @@ Interface function for Eigensolver on Quda
 *********************************************************/
 
 
-double eigsolveQuda(int n, double tol, 
-                  int blksize, int blkwise,
-                  int max_iterations,  int maxmin) {
+double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
+                    const double precision, const int max_iter, const int solver_flag, const int rel_prec,
+                    const int even_odd_flag, const SloppyPrecision refinement_precision,
+                    SloppyPrecision sloppy_precision, CompressionType compression) {
+
+  tm_stopwatch_push(&g_timers, __func__, "");
   
   // check if QUDA is initialized
   if (!quda_initialized) {
-    fatal_error("QUDA must be initialized.","eigsolveQuda");
-    return -1;
+    // it returns if quda is already init
+    _initQuda();
   }
 
-  tm_stopwatch_push(&g_timers, __func__, "");
+    if ( rel_prec )
+      inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
+    else
+      inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
+
+    inv_param.kappa = g_kappa;
+    
+    // figure out which BC tu use (theta, trivial...)
+    set_boundary_conditions(&compression, &gauge_param);
+
+    set_sloppy_prec(sloppy_precision, refinement_precision, &gauge_param, &inv_param);
+
+  // load gauge after setting precision
+  _loadGaugeQuda(compression);
+
+  _setTwoFlavourSolverParam(g_kappa, g_c_sw, g_mubar, g_epsbar, solver_flag, even_odd_flag, precision, max_iter,
+                            1 /*single_parity_solve */,
+                            1 /*always QpQm*/);
+
+  // QUDA applies the MMdag operator, we need QpQm^{-1) in the end
+  // so we want QUDA to use the MdagM operator
+  inv_param.dagger = QUDA_DAG_YES;
+
 
   _Complex double * eigenvls;
   double returnvalue;
diff --git a/quda_interface.h b/quda_interface.h
index c163c489f..5320fd5ea 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -175,8 +175,9 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
 
-double eigsolveQuda(int n, double tol, 
-                  int blksize, int blkwise,
-                  int max_iterations,  int maxmin);
+double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
+                    const double precision, const int max_iter, const int solver_flag, const int rel_prec,
+                    const int even_odd_flag, const SloppyPrecision refinement_precision,
+                    SloppyPrecision sloppy_precision, CompressionType compression);
 
 #endif /* QUDA_INTERFACE_H_ */

From a06e083aa61d84420acb8a5968a01f3256b7ee9f Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 26 Apr 2023 17:20:57 +0200
Subject: [PATCH 13/49] removed unnecessary quda_initialized check

---
 quda_interface.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 7a23776f0..bd2ff73fc 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2595,11 +2595,9 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
 
   tm_stopwatch_push(&g_timers, __func__, "");
   
-  // check if QUDA is initialized
-  if (!quda_initialized) {
-    // it returns if quda is already init
-    _initQuda();
-  }
+  
+  // it returns if quda is already init
+  _initQuda();
 
     if ( rel_prec )
       inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;

From 562e9d993f97cc35a0399cb65e7b42326abadfde Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Wed, 3 May 2023 18:36:31 +0200
Subject: [PATCH 14/49] add some more documentation of the various QUDA MG
 parameters (still incomplete)

---
 doc/quda.tex | 70 ++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 52 insertions(+), 18 deletions(-)

diff --git a/doc/quda.tex b/doc/quda.tex
index 85fa14095..ffa2164c7 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -114,26 +114,20 @@ \subsubsection{QUDA-MG interface}
 EndOperator
 \end{verbatim}
 
-The MG setup can be tuned using the following parameters in the \texttt{BeginExternalInverter QUDA} section:
+\paragraph{Basic MG parameters:} The MG setup can be tuned using the following parameters in the \texttt{BeginExternalInverter QUDA} section:
 \begin{itemize}
-  \item{ \texttt{MGNumberOfLevels}: number of levels to be used in the MG, $3$ is usually ideal but $2$ can be similarly efficient depending on the quark mass (positive integer, default $3$) }
+  \item{ \texttt{MGNumberOfLevels}: number of levels to be used in the MG, $3$ is usually ideal but $2$ can be similarly efficient depending on the quark mass (positive integer, default: $3$) }
   \item{ \texttt{MGSetupSolver}: solver used for generating null vectors. \texttt{CG} or \texttt{BiCGstab} (default \texttt{CG}). Usage of \texttt{BiCGstab} may be recommended for Wilson or clover Wilson quarks. }
-  \item{ \texttt{MGSetupSolverTolerance}: relative target residual (unsquared!) during setup phase. (positive float, default $1\cdot10^{-6}$) }
-  \item{ \texttt{MGSetupMaxSolverIterations}: maximum number of iterations during setup phase. (positive integer, default $1000$) }
-  \item{ \texttt{MGCoarseSolverTolerance}: unsquared relative target residual on the coarse grids. (positive float, default $0.25$) }
-  \item{ \texttt{MGNumberOfVectors}: number of null vectors to compute on a per-level basis. (possible values $\left[ 24, 32 \right]$, default $24$)}
-  \item{ \texttt{MGCoarseMaxSolveriterations}: maximum number of iterations on coarse grids. (positive integer, default $75$) }
-  \item{ \texttt{MGEnableSizeThreeBlocks}: By default, QUDA has limited support for size $3$ aggregates. If set to \emph{yes}, the automatic blocking algorithm will attempt to use them for lattice extents divisible by $3$ when the local lattice extent at a given level is smaller than $16$ aggregate sites. This requires you to instantiate the necessary block sizes in QUDA (see comments below). (boolean \emph{yes} or \emph{no}, default \emph{no}) }
-  \item{ \texttt{MGBlockSizes[X,Y,Z,T]}: aggregate sizes on each level. When these are set for a given lattice dimension, the automatic blocking algorithm for that dimension is overridden and the specified blockings are forced. When the required aggregate sizes are not instantiated in QUDA, the setup phase will fail with an informative error message. (comma-separated list of integers, for a three level solver, for example, this needs to be specified for the first and second level)} 
-  \item{ \texttt{MGSmootherTolerance}: unsquared relative target residual of the smoother on all levels. (positive float, default $0.25$) }
-  \item{ \texttt{MGSmootherPreIterations}: number of smoothing steps before coarse grid correction. (zero or positive integer, default $0$)}
-  \item{ \texttt{MGSmootherPostIterations}: number of smoothing steps after prolongation. (zero or positive integer, default $4$)}
-  \item{ \texttt{MGOverUnderRelaxationFactor}: Over- or under-relaxation factor. (positive float, default $0.85$)}
-  \item{ \texttt{MGCoarseMuFactor}: Scaling factor for twisted mass on a per-level basis, accelerates convergence and reduces condition numer of coarse grid. From experience it seems that it's reasonable to set this $>1.0$ only on the coarsest level, but it might also help on intermediate levels. If running with twisted mass, this should always be set and tuned for maximum efficiency. (positive float, usually $ > 1.0$, default $8.0$ from the second level upwards).}
-  \item{ \texttt{MGRunVerify}: Check GPU coarse operators against CPU coarse operators and verify Galerkin projectors during setup phase. This is usually fast enough to always be performed, although sometimes it seems to fail even though the setup works fine. (\emph{yes} or \emph{no}, default \emph{yes}) } 
+  \item{ \texttt{MGSetupSolverTolerance}: relative target residual (unsquared!) during setup phase on a per-level basis. (comma-separated list of positive floats, default: $1\cdot10^{-6}, 1\cdot10^{-6}, \ldots$) }
+  \item{ \texttt{MGSetupMaxSolverIterations}: maximum number of iterations during setup phase on a per-level basis. (comma-separated list of positive integers, default: $1000, 1000, \ldots$) }
+  \item{ \texttt{MGCoarseSolverTolerance}: unsquared relative target residual on the coarse grids on a per-level basis. (comma-separated list of positive floats, default: $0.25, 0.25, \ldots$) }
+  \item{ \texttt{MGNumberOfVectors}: number of null vectors to compute on a per-level basis. (comma-separated list of integers, possible values $\left[ 24, 32 \right]$, default: $24, 24, \ldots$)}
+  \item{ \texttt{MGCoarseMaxSolveriterations}: maximum number of iterations on coarse grids on a per-level basis. (comma-separated list of positive integers, default: $75, 75, \ldots$) }
+  \item{ \texttt{MGEnableSizeThreeBlocks}: Historically and by default, QUDA has had limited support for size $3$ aggregates. If set to \emph{yes}, the automatic blocking algorithm will attempt to use them for lattice extents divisible by $3$ when the local lattice extent at a given level is smaller than $16$ aggregate sites. This may require you to instantiate the necessary block sizes in QUDA (see comments below). In recent (2022 and later) commits of the QUDA \texttt{develop} branch, this can usually be set to \emph{yes} without having to modify QUDA. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
+  \item{ \texttt{MGBlockSizes[X,Y,Z,T]}: aggregate sizes on each level. When these are set for a given lattice dimension, the automatic blocking algorithm for that dimension is overridden and the specified blockings are forced. When the required aggregate sizes are not instantiated in QUDA, the setup phase will fail with an informative error message. (comma-separated list of positive integers, for a three level solver, for example, two numbers needs to be specified: aggregate size on the finest and intermediate level)}
 \end{itemize}
 
-If no blocking is specified manually, the aggregation parameters are set automatically as follows:
+\paragraph*{Automatic blocking:} If no blocking is specified manually, the aggregation parameters are set automatically as follows:
 \begin{itemize}
   \item{ A default block size of $4$ is attempted if the MPI-partitioned fine or aggregate lattice extent is larger or equal to $16$ lattice sites. }
   \item{ If the number of aggregate lattice sites in a given direction is even and smaller than $16$, a block size of $2$ is used. }
@@ -141,13 +135,53 @@ \subsubsection{QUDA-MG interface}
   \item{ In all other cases, aggregation is disabled for this direction and level. This includes, for instance, extents divisible by primes other than $2$ or $3$. }
 \end{itemize}
 
+\paragraph{Further MG parameters:}
+\begin{itemize}
+  \item{ \texttt{MGSmootherTolerance}: unsquared relative target residual of the smoother on a per-level basis. (comma-separated list of positive floats, default: $0.25, 0.25, \ldots$) }
+  \item{ \texttt{MGSmootherPreIterations}: number of smoothing steps before coarse grid correction on a per-level basis. (comma-separated list of zero or positive integers, default: $0, 0, \ldots$)}
+  \item{ \texttt{MGSmootherPostIterations}: number of smoothing steps after prolongation on a per-level basis. (comma-separated list of zero or positive integers, default: $4, 4, \ldots$)}
+  \item{ \texttt{MGOverUnderRelaxationFactor}: Over- or under-relaxation factor on a per-level basis. (comma-separated list of positive floats, default: $0.85, 0.85, \ldots$)}
+  \item{ \texttt{MGCoarseMuFactor}: Scaling factor for twisted mass on a per-level basis, accelerates convergence and reduces condition numer of coarse grid solves. From experience it seems that it's reasonable to set this $>1.0$ only on the coarsest level, but sometimes it might also help on intermediate levels. If running with twisted mass, this should always be set and tuned for maximum efficiency. When using coarse-grid deflation (see \texttt{MGEigSolverRequireConvergence}), this should usually be set to $1.0$ on all levels. (comma-separated list of positive floats, usually $ > 1.0$, default $8.0$ from the second level upwards).}
+  \item{ \texttt{MGReuseSetupMuThreshold}: When the twisted quark mass is changed between solves using the MG solver, the MG setup is usually \emph{updated} for this new $\mu$ value. One can attempt to reuse the MG setup for solves with different mu values up to this threshold, i.e., when the condition $x < 2\kappa\cdot|\mu_\mathrm{old} - \mu_\mathrm{new}|$ holds. (positive float, default: \texttt{2*DBL\_EPSILON})}
+  \item{ \texttt{MGRefreshSetupMDUThreshold}: When the MG is used in the HMC, the MG setup must be regularly refreshed by running a few iterations of the setup solver on the current set of approximate null vectors in order to evolve these with the changing gauge field. A good rule of thumb is to perform this setup refresh about twice per coarsest time step. In other words: for a trajectory length $\tau$ and $N$ integration steps on the coarsest time scale, the refresh should be performed at intervals of $(\tau/(2N)-\epsilon)$ MDUs, where $\epsilon$ is a small number to make sure that the threshold is hit at every half-step of the integrator. (positive float, default: \texttt{2*DBL\_EPSILON})}
+  \item{ \texttt{MGRefreshSetupMaxSolverIterations}: Number of setup iterations to be performed on a per-level basis when a setup refresh is required, see also \texttt{MGRefreshSetupMDUThreshold} above. (comma-separated list of positive integers, default: $0, 0, \ldots$}
+  \item{ \texttt{MGResetSetupMDUThreshold}: When the MG is used in the HMC, the MG setup must be reset (instead of being refreshed) if the gauge configuration changes significantly at some point. This is the case, for example, when a trajectory is rejected. This value must always be set and a possible choice is to set it to the trajectory length. Note that there is an interaction with the preprocessor constant \texttt{TM\_GAUGE\_PROPAGATE\_THRESHOLD}, the default value of which is $3.0$. This is because when a trajectory is rejected, an internal variable which tracks the state of the gauge field is incremented by \texttt{TM\_GAUGE\_PROPAGATE\_THRESHOLD} to ensure that everything that depends on the gauge field is properly updated. This means that if the trajectory length $\tau \geq 3.0$, one should ignore the recommendation and set $x < 3.0$. In general, any value significantly larger than \texttt{MGRefreshSetupMDUThreshold} should be fine. (positive float, default: \texttt{2*DBL\_EPSILON}) }
+  \item{ \texttt{MGRunVerify}: Check GPU coarse-grid operators against CPU reference implementation and verify Galerkin projectors during setup phase. This is usually fast enough to always be performed, although sometimes it seems to fail even though the setup works fine. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
+  \item{ \texttt{MGRunObliqueProjectionCheck}: Measure how well the null vector subspace adjusts the low eigenmode subspace. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
+  \item{ \texttt{MGLowModeCheck}: Measure how well the null vector subspace overlaps with the low eigenmode subspace. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
+\end{itemize}
+
+The default parameters and an automatically determined aggregation will generally produce a solver which will outperform mixed-precision CG by about two orders of magnitude (at the physical point) as long as the value for \texttt{MGCoarseMuFactor} on the coarsest grid is tuned appropriately. Further refinements are possible and improvements by up to another order of magnitude are achievable through smart choices for parameters related to the smoother, the number of vectors used on each level and the tolerances related to the smoother and coarse-grid solvers. These details are dependent on the target quark mass and the used ensemble and they may be dependent on the particular configuration (for certain ensembles, time to solution fluctuates quite strongly depending on the point in the molecular dynamics history). Finally, the tuning of the algorithmic parameters depends on the characteristics of the particular machine as one may compensate, for example, a poorly performing coarsest-grid operator through more iterations on the intermediate and fine grids.
+
+\paragraph{Coarse-grid-deflated solver:}
+Especially with twisted mass quarks, the coarse-grid operator is very poorly conditioned. One way of improving the convergence of the solver is the scaling of the $\mu$ parameter on the coarsest and intermediate grid as described in \texttt{MGCoarseMuFactor} above. A different and more potent way which pays off for propagator inversions (but is not suitable for the HMC) is to use coarse-grid deflation. In this case, the first $n_\mathrm{ev}$ eigenvectors of the coarse-grid operator are computed and the resulting deflation subspace is used to precondition the system. In this case, experience seems to indicate that \texttt{MGCoarseMuFactor} should be set to $1.0$ everywhere as it is no longer required to improve convergence. The options of QUDA's eigensolver are set via the following set of parameters:
+\begin{itemize}
+  \item{ \texttt{MGUseEigSolver}: Whether or not the eigensolver should be run for the operator on a per-level basis. The most common use-case would be to enable the eigensolver on the coarsest grid. (comma-separated list of booleans \emph{yes} or \emph{no}, default: \emph{no, no, \ldots})}
+  \item{ \texttt{MGEigSolverType}: Type of eigensolver to use on a per-level basis. Possible values:
+    \begin{itemize}
+      \item \texttt{tr\_lanczos} (truncated Lanczos)
+      \item \texttt{ir\_arnoldi} (implicitly-restarted Arnoldi)
+    \end{itemize}
+    (comma-separated list of strings, default: \texttt{tr\_lanczos, tr\_lanczos, \ldots})}
+  \item{ \texttt{MGEigSolverSpectrum}: Which part of the spectrum the eigensolver should focus on specified on a per-level basis. Possible values:
+    \begin{itemize}
+      \item \texttt{smallest\_real} (smallest eigenvalues based on their real part)
+      \item \texttt{largest\_real} (largest eigenvalues based on their real part)
+      \item \texttt{smallest\_imaginary} (smallest eigenvalues based on their imaginary part)
+      \item \texttt{largest\_imaginary} (largest eigenvalues based on their imaginary part)
+      \item \texttt{smallest\_modulus} (smallest eigenvalues in modulus)
+      \item \texttt{largest\_modulus} (largest eigenvalues in modulus)
+    \end{itemize}
+    For the purpose of deflating the coarse-grid system, the usual choice is \texttt{smallest\_real}.
+    (comma-separated list of strings, default: \texttt{smallest\_real, smallest\_real, \ldots})}
+\end{itemize}
 
 Note that at the time of writing (2017.12.28), only double-single mixed-precision is supported for the MG-preconditioned GCR solver and the solve will abort if a double-half precision solve is attempted.
 
 A typical MG setup might look like this for twisted mass clover quarks: 
 
 \begin{verbatim}
-BeginExternalInverter QUDA
+{BeginExternalInverter QUDA
   MGNumberOfLevels = 3
   MGSetupSolver = cg
   MGSetupSolverTolerance = 1e-6
@@ -175,7 +209,7 @@ \subsubsection{QUDA-MG interface}
   MGBlockSizesZ = 6, 4
   MGBlockSizesT = 6, 4
   MGSetupSolver = cg
-  MGSetupSolverTolerance = 1e-6
+  MGSetupSolverTolerance = 1e-6, 1e-6, 
   MGSetupMaxSolverIterations = 1000
   MGCoarseSolverTolerance = 0.25
   MGCoarseSolverIterations = 75

From 81e1d351030ec13c3822180a862f690cffb744d6 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Thu, 4 May 2023 16:33:13 +0200
Subject: [PATCH 15/49] to be safe, fix null vector MG setup type in QUDA
 interface

---
 quda_interface.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/quda_interface.c b/quda_interface.c
index 9922ef3ae..2f6a4da51 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -1863,6 +1863,8 @@ void _setQudaMultigridParam(QudaMultigridParam* mg_param) {
   QudaInvertParam * mg_inv_param = mg_param->invert_param;
   _setMGInvertParam(mg_inv_param, &inv_param);
 
+  mg_param->setup_type = QUDA_NULL_VECTOR_SETUP;
+
   mg_param->preserve_deflation = quda_input.mg_eig_preserve_deflation;
 
   mg_param->n_level = quda_input.mg_n_level;

From 26fcdd3d8046b30920e4ae1e6d39294f9709d9b9 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Thu, 4 May 2023 16:34:35 +0200
Subject: [PATCH 16/49] allow CA solvers to be used for the QUDA-MG setup,
 incidentally add explicit CGNE and CGNR options

---
 quda_dummy_types.h    |  5 +++++
 read_input.l          | 25 ++++++++++++++++++++++++-
 solver/solver_types.h |  5 +++++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/quda_dummy_types.h b/quda_dummy_types.h
index b689050dc..668237689 100644
--- a/quda_dummy_types.h
+++ b/quda_dummy_types.h
@@ -39,6 +39,11 @@ typedef enum QudaInverterType_s {
  QUDA_CG_INVERTER = CG,
  QUDA_MR_INVERTER = MR,
  QUDA_GCR_INVERTER = GCR,
+ QUDA_CGNE_INVERTER = CGNE,
+ QUDA_CGNR_INVERTER = CGNR,
+ QUDA_CA_CG_INVERTER = CA_CG,
+ QUDA_CA_CGNE_INVERTER = CA_CGNE, 
+ QUDA_CA_CGNR_INVERTER = CA_CGNR,
  QUDA_CA_GCR_INVERTER = CA_GCR
 } QudaInverterType;
 
diff --git a/read_input.l b/read_input.l
index c4e8c0764..102c40b72 100644
--- a/read_input.l
+++ b/read_input.l
@@ -1733,10 +1733,33 @@ static inline double fltlist_next_token(int * const list_end){
     quda_input.mg_setup_2kappamu = c;
     if(myverbose) printf("  MGSetup2KappaMu set to %e line %d\n", quda_input.mg_setup_2kappamu, line_of_file);
   }
+  /* TODO: the MGSetupSolver should be set on a per-level basis
+           allowing communication-avoiding solvers to be used
+           on the coarser grids */
   {SPC}*MGSetupSolver{EQL}cg {
     quda_input.mg_setup_inv_type = QUDA_CG_INVERTER;
     if(myverbose) printf("  MGSetupSolver set to CG line %d\n", line_of_file);
   }
+  {SPC}*MGSetupSolver{EQL}cgne {
+    quda_input.mg_setup_inv_type = QUDA_CGNE_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CGNE line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolver{EQL}cgnr {
+    quda_input.mg_setup_inv_type = QUDA_CGNE_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CGNR line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolver{EQL}cacg {
+    quda_input.mg_setup_inv_type = QUDA_CA_CG_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CA-CG line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolver{EQL}cacgne {
+    quda_input.mg_setup_inv_type = QUDA_CA_CGNE_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CA-CGNE line %d\n", line_of_file);
+  }
+  {SPC}*MGSetupSolver{EQL}cacgnr {
+    quda_input.mg_setup_inv_type = QUDA_CA_CGNR_INVERTER;
+    if(myverbose) printf("  MGSetupSolver set to CA-CGNR line %d\n", line_of_file);
+  }
   {SPC}*MGSetupSolver{EQL}bicgstab {
     quda_input.mg_setup_inv_type = QUDA_BICGSTAB_INVERTER;
     if(myverbose) printf("  MGSetupSolver set to BiCGstab line %d\n", line_of_file);
@@ -3827,7 +3850,7 @@ int read_input(char * conf_file){
     quda_input.mg_n_vec[level] = _default_quda_mg_n_vec;
     quda_input.mg_mu_factor[level] = 1.0;
     quda_input.mg_coarse_solver_type[level] = QUDA_GCR_INVERTER;
-    quda_input.mg_smoother_type[level] = QUDA_MR_INVERTER ;
+    quda_input.mg_smoother_type[level] = QUDA_CA_GCR_INVERTER;
 
     quda_input.mg_use_eig_solver[level] = QUDA_BOOLEAN_NO;
     quda_input.mg_eig_preserve_deflation = QUDA_BOOLEAN_NO;
diff --git a/solver/solver_types.h b/solver/solver_types.h
index 0538481ab..7cd1a52e1 100644
--- a/solver/solver_types.h
+++ b/solver/solver_types.h
@@ -48,6 +48,11 @@ typedef enum SOLVER_TYPE {
  MIXEDBICGSTAB,
  DUMMYHERMTEST,
  CA_GCR,
+ CGNE,
+ CGNR,
+ CA_CG,
+ CA_CGNE,
+ CA_CGNR,
  INVALID_SOLVER
 } SOLVER_TYPE;
 

From 879bd41cb3d93b877cedb19c45b19a95bb347d3e Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Thu, 4 May 2023 18:02:29 +0200
Subject: [PATCH 17/49] more progress on QUDA-MG documentation

---
 doc/quda.tex | 64 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 9 deletions(-)

diff --git a/doc/quda.tex b/doc/quda.tex
index ffa2164c7..6323c07ba 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -116,32 +116,44 @@ \subsubsection{QUDA-MG interface}
 
 \paragraph{Basic MG parameters:} The MG setup can be tuned using the following parameters in the \texttt{BeginExternalInverter QUDA} section:
 \begin{itemize}
-  \item{ \texttt{MGNumberOfLevels}: number of levels to be used in the MG, $3$ is usually ideal but $2$ can be similarly efficient depending on the quark mass (positive integer, default: $3$) }
-  \item{ \texttt{MGSetupSolver}: solver used for generating null vectors. \texttt{CG} or \texttt{BiCGstab} (default \texttt{CG}). Usage of \texttt{BiCGstab} may be recommended for Wilson or clover Wilson quarks. }
+  \item{ \texttt{MGNumberOfLevels}: Number of levels to be used in the MG, $3$ is usually ideal but $2$ can be similarly efficient depending on the quark mass (positive integer, default: $3$) }
+  \item{ \texttt{MGSetupSolver}: Solver used for generating null vectors. Possible values:
+    \begin{itemize}
+      \item \texttt{CG} (conjugate gradient)
+      \item \texttt{CGNE} (conjugate gradient normal equations)
+      \item \texttt{CGNR} (conjugate gradient normal residuals)
+      \item \texttt{CACG} (communication-avoiding conjugate gradient)
+      \item \texttt{CACGNE} (communication-avoiding conjugate gradient normal equations)
+      \item \texttt{CACGNR} (communication-avoiding conjugate gradient normal residuals)
+      \item \texttt{BiCGstab} (stabilised bi-conjugate gradient)
+    \end{itemize}
+    Usage of \texttt{BiCGstab} may be recommended for Wilson or clover Wilson quarks while \texttt{CG} should always be used for twised mass or twisted clover quarks. \texttt{CACG} may be beneficial at the strong-scaling limit. At the time of writing (May 2023), this cannot be set on a per-level basis so the effect of the communication-avoiding setup solver on the setup time may be limited or even detrimental.
+        (default: \texttt{CG})}
   \item{ \texttt{MGSetupSolverTolerance}: relative target residual (unsquared!) during setup phase on a per-level basis. (comma-separated list of positive floats, default: $1\cdot10^{-6}, 1\cdot10^{-6}, \ldots$) }
   \item{ \texttt{MGSetupMaxSolverIterations}: maximum number of iterations during setup phase on a per-level basis. (comma-separated list of positive integers, default: $1000, 1000, \ldots$) }
   \item{ \texttt{MGCoarseSolverTolerance}: unsquared relative target residual on the coarse grids on a per-level basis. (comma-separated list of positive floats, default: $0.25, 0.25, \ldots$) }
   \item{ \texttt{MGNumberOfVectors}: number of null vectors to compute on a per-level basis. (comma-separated list of integers, possible values $\left[ 24, 32 \right]$, default: $24, 24, \ldots$)}
   \item{ \texttt{MGCoarseMaxSolveriterations}: maximum number of iterations on coarse grids on a per-level basis. (comma-separated list of positive integers, default: $75, 75, \ldots$) }
   \item{ \texttt{MGEnableSizeThreeBlocks}: Historically and by default, QUDA has had limited support for size $3$ aggregates. If set to \emph{yes}, the automatic blocking algorithm will attempt to use them for lattice extents divisible by $3$ when the local lattice extent at a given level is smaller than $16$ aggregate sites. This may require you to instantiate the necessary block sizes in QUDA (see comments below). In recent (2022 and later) commits of the QUDA \texttt{develop} branch, this can usually be set to \emph{yes} without having to modify QUDA. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
-  \item{ \texttt{MGBlockSizes[X,Y,Z,T]}: aggregate sizes on each level. When these are set for a given lattice dimension, the automatic blocking algorithm for that dimension is overridden and the specified blockings are forced. When the required aggregate sizes are not instantiated in QUDA, the setup phase will fail with an informative error message. (comma-separated list of positive integers, for a three level solver, for example, two numbers needs to be specified: aggregate size on the finest and intermediate level)}
+  \item{ \texttt{MGBlockSizes[X,Y,Z,T]}: aggregate sizes on each level. When these are set non-zero for a given lattice dimension and a given level, the automatic blocking algorithm for that dimension and level is overridden and the specified blockings are forced. When the required aggregate sizes are not instantiated in QUDA, the setup phase will fail with an informative error message. (comma-separated list of positive integers, for a three level solver, for example, two numbers needs to be specified: aggregate size on the finest and intermediate level, default: \texttt{0, 0})}
 \end{itemize}
 
 \paragraph*{Automatic blocking:} If no blocking is specified manually, the aggregation parameters are set automatically as follows:
 \begin{itemize}
-  \item{ A default block size of $4$ is attempted if the MPI-partitioned fine or aggregate lattice extent is larger or equal to $16$ lattice sites. }
-  \item{ If the number of aggregate lattice sites in a given direction is even and smaller than $16$, a block size of $2$ is used. }
-  \item{ The option \texttt{MGEnableSizeThreeBlocks} can be set to \texttt{yes}. Then, for levels coarser than the fine grid, extents smaller than $16$ and divisible by $3$, a block size of $3$ will be used. This will almost certainly require the addition of instantiations of block sizes to QUDA in the restrictor and transfer operator. (\texttt{lib/restrictor.cu} and \texttt{lib/transfer\_util.cu}) }
-  \item{ In all other cases, aggregation is disabled for this direction and level. This includes, for instance, extents divisible by primes other than $2$ or $3$. }
+  \item{ A default block size of $4$ is attempted if the MPI-partitioned local lattice extent on the current level is even and larger than or equal to $16$ lattice sites. }
+  \item{ If the number of lattice sites in a given direction on the current level is even and smaller than $16$, a block size of $2$ is used. }
+  \item{ The option \texttt{MGEnableSizeThreeBlocks} can be set to \texttt{yes}. Then, for levels coarser than the fine grid, extents smaller than $16$ and divisible by $3$, a block size of $3$ will be used. This might require the addition of instantiations of block sizes to QUDA in the restrictor and transfer operator. (\texttt{lib/restrictor.cu} and \texttt{lib/transfer\_util.cu}) }
+  \item{ In all other cases, aggregation is disabled for this direction and level. This includes, for instance, extents divisible by primes other than $2$ or $3$. This means that if you need blocks of $5$ or $7$ for geometry reasons, you will need to choose a setup manually.}
 \end{itemize}
 
-\paragraph{Further MG parameters:}
+\paragraph{Further basic MG parameters:}
 \begin{itemize}
   \item{ \texttt{MGSmootherTolerance}: unsquared relative target residual of the smoother on a per-level basis. (comma-separated list of positive floats, default: $0.25, 0.25, \ldots$) }
   \item{ \texttt{MGSmootherPreIterations}: number of smoothing steps before coarse grid correction on a per-level basis. (comma-separated list of zero or positive integers, default: $0, 0, \ldots$)}
   \item{ \texttt{MGSmootherPostIterations}: number of smoothing steps after prolongation on a per-level basis. (comma-separated list of zero or positive integers, default: $4, 4, \ldots$)}
   \item{ \texttt{MGOverUnderRelaxationFactor}: Over- or under-relaxation factor on a per-level basis. (comma-separated list of positive floats, default: $0.85, 0.85, \ldots$)}
   \item{ \texttt{MGCoarseMuFactor}: Scaling factor for twisted mass on a per-level basis, accelerates convergence and reduces condition numer of coarse grid solves. From experience it seems that it's reasonable to set this $>1.0$ only on the coarsest level, but sometimes it might also help on intermediate levels. If running with twisted mass, this should always be set and tuned for maximum efficiency. When using coarse-grid deflation (see \texttt{MGEigSolverRequireConvergence}), this should usually be set to $1.0$ on all levels. (comma-separated list of positive floats, usually $ > 1.0$, default $8.0$ from the second level upwards).}
+  \item{ \texttt{MGSetup2KappaMu}: The value of $2\kappa\mu$ which should be used during the MG setup process. This is important in the HMC for standard twisted mass fermions, for example, because the setup should always be performed with the smallest quark mass to be employed in a simulation and it might be that a monomial with a heavier twisted quark mass is the first to call to MG and to thus trigger the setup. Generally this is set to the target light twisted quark mass. Setting this to $0.0$ implies that it is ignored. (float, default: $0.0$) }
   \item{ \texttt{MGReuseSetupMuThreshold}: When the twisted quark mass is changed between solves using the MG solver, the MG setup is usually \emph{updated} for this new $\mu$ value. One can attempt to reuse the MG setup for solves with different mu values up to this threshold, i.e., when the condition $x < 2\kappa\cdot|\mu_\mathrm{old} - \mu_\mathrm{new}|$ holds. (positive float, default: \texttt{2*DBL\_EPSILON})}
   \item{ \texttt{MGRefreshSetupMDUThreshold}: When the MG is used in the HMC, the MG setup must be regularly refreshed by running a few iterations of the setup solver on the current set of approximate null vectors in order to evolve these with the changing gauge field. A good rule of thumb is to perform this setup refresh about twice per coarsest time step. In other words: for a trajectory length $\tau$ and $N$ integration steps on the coarsest time scale, the refresh should be performed at intervals of $(\tau/(2N)-\epsilon)$ MDUs, where $\epsilon$ is a small number to make sure that the threshold is hit at every half-step of the integrator. (positive float, default: \texttt{2*DBL\_EPSILON})}
   \item{ \texttt{MGRefreshSetupMaxSolverIterations}: Number of setup iterations to be performed on a per-level basis when a setup refresh is required, see also \texttt{MGRefreshSetupMDUThreshold} above. (comma-separated list of positive integers, default: $0, 0, \ldots$}
@@ -151,7 +163,41 @@ \subsubsection{QUDA-MG interface}
   \item{ \texttt{MGLowModeCheck}: Measure how well the null vector subspace overlaps with the low eigenmode subspace. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
 \end{itemize}
 
-The default parameters and an automatically determined aggregation will generally produce a solver which will outperform mixed-precision CG by about two orders of magnitude (at the physical point) as long as the value for \texttt{MGCoarseMuFactor} on the coarsest grid is tuned appropriately. Further refinements are possible and improvements by up to another order of magnitude are achievable through smart choices for parameters related to the smoother, the number of vectors used on each level and the tolerances related to the smoother and coarse-grid solvers. These details are dependent on the target quark mass and the used ensemble and they may be dependent on the particular configuration (for certain ensembles, time to solution fluctuates quite strongly depending on the point in the molecular dynamics history). Finally, the tuning of the algorithmic parameters depends on the characteristics of the particular machine as one may compensate, for example, a poorly performing coarsest-grid operator through more iterations on the intermediate and fine grids.
+The default parameters and an automatically determined aggregation will generally produce a solver which will outperform mixed-precision CG by about one to two orders of magnitude (at the physical point) as long as the value for \texttt{MGCoarseMuFactor} on the coarsest grid is tuned appropriately. Further refinements are possible and improvements by up to another order of magnitude are achievable through smart choices for parameters related to the smoother, the number of vectors used on each level and the tolerances related to the smoother and coarse-grid solvers. These details are dependent on the target quark mass and the used ensemble and they may be dependent on the particular configuration (for certain ensembles, time to solution fluctuates quite strongly depending on the point in the molecular dynamics history). Finally, the tuning of the algorithmic parameters depends on the characteristics of the particular machine as one may compensate, for example, a poorly performing coarsest-grid operator through more iterations on the intermediate and fine grids. This kind of compensation will lead to a setup which is algorithmically less efficient (more matrix-vector products) but may have a lower overall time-to-solution.
+
+\paragraph{Advanced MG parameters:}
+\begin{itemize}
+  \item{ \texttt{MGCoarseSolverType}: The type of solver to be used on the intermediate and coarse grids. While this has to be specified for all levels, it will be ignored on the fine grid. Possible values:
+    \begin{itemize}
+      \item \texttt{cagcr} (communication-avoiding generalised conjugate residual)
+      \item \texttt{gcr} (generalised conjugate residual)
+      \item \texttt{mr} (minimal residual)
+    \end{itemize}
+    The recommendation is to use \texttt{gcr} on all intermediate and \texttt{cagcr} on the coarsest level.
+    (comma-separated list of strings, default: \texttt{gcr} on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisType}: The type of basis to use for the communication-avoiding solver on a per-level basis. Ignored unless \texttt{cagcr} is set for \texttt{MGCoarseSolverType} for the given level. Possible values:
+  \begin{itemize}
+    \item \texttt{power} (power basis)
+    \item \texttt{chebyshev} (Chebyshev basis)
+  \end{itemize}
+    (comma-separated list of strings, default: \texttt{power} on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisSize}: The size of the polynomial basis to use if a communication-avoiding solver is used on a per-level basis. Ignored unless \texttt{cagcr} is set for \texttt{MGCoarseSolverType} for the given level. Values larger than $4$ require \texttt{chebyshev} to be set for the corresponding \texttt{MGCoarseSolverCABasisType}. (comma-separated list of positive integers, default: $4$ on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisLambdaMin}: Minimum eigenvalue specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGCoarseSolverCABasisType}. The default value of $0.0$ is recommened for safety. (comma-separated list of postitive floats, default: $0.0$ on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisLambdaMax}: Maximum eigenvalue specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGCoarseSolverCABasisType}. The default value of $-1.0$ triggers power iterations which estimate this automatically. (comma-separated list of positive floats, default: $-1.0$ on all levels)}
+  \item{ \texttt{MGSetupCABasisType}: The type of polynomial basis to use if a communication-avoiding solver is used in the setup on a per-level basis. Possible values:
+  \begin{itemize}
+    \item \texttt{power} (power basis)
+    \item \texttt{chebyshev} (Chebyshev)
+  \end{itemize}
+    The recommendation is to use \texttt{chebyshev} for stability reasons. (comma-separated list of strings, default: \texttt{power} on all levels)}
+  \item{ \texttt{MGSmootherType}: The type of algorithm used as a smoother on a per-level basis. Possible values:
+    \begin{itemize}
+      \item \texttt{cagcr} (communication-avoiding generalised conjugate residual)
+      \item \texttt{gcr} (generalised conjugate residual)
+      \item \texttt{mr} (minimal residual)
+    \end{itemize}
+    (comma-separated list of strings, default: \texttt{cagcr} on all levels)}
+\end{itemize}
 
 \paragraph{Coarse-grid-deflated solver:}
 Especially with twisted mass quarks, the coarse-grid operator is very poorly conditioned. One way of improving the convergence of the solver is the scaling of the $\mu$ parameter on the coarsest and intermediate grid as described in \texttt{MGCoarseMuFactor} above. A different and more potent way which pays off for propagator inversions (but is not suitable for the HMC) is to use coarse-grid deflation. In this case, the first $n_\mathrm{ev}$ eigenvectors of the coarse-grid operator are computed and the resulting deflation subspace is used to precondition the system. In this case, experience seems to indicate that \texttt{MGCoarseMuFactor} should be set to $1.0$ everywhere as it is no longer required to improve convergence. The options of QUDA's eigensolver are set via the following set of parameters:

From 232ec6ea6e1a661b88c43d55825c1284c55f592b Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Fri, 5 May 2023 12:22:44 +0200
Subject: [PATCH 18/49] add functionality to set smoother CA parameters

---
 quda_interface.c |  5 +++++
 quda_types.h     |  5 +++++
 read_input.l     | 17 +++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/quda_interface.c b/quda_interface.c
index 2f6a4da51..6e13002d9 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2013,6 +2013,11 @@ void _setQudaMultigridParam(QudaMultigridParam* mg_param) {
     mg_param->coarse_solver_ca_lambda_min[level] = quda_input.mg_coarse_solver_ca_lambda_min[level];
     mg_param->coarse_solver_ca_lambda_max[level] = quda_input.mg_coarse_solver_ca_lambda_max[level];
     
+    mg_param->smoother_solver_ca_basis[level]      = quda_input.mg_smoother_solver_ca_basis[level];
+    mg_param->smoother_solver_ca_basis_size[level] = quda_input.mg_smoother_solver_ca_basis_size[level];
+    mg_param->smoother_solver_ca_lambda_min[level] = quda_input.mg_smoother_solver_ca_lambda_min[level];
+    mg_param->smoother_solver_ca_lambda_max[level] = quda_input.mg_smoother_solver_ca_lambda_max[level];
+   
 
     // set the MG EigSolver parameters, almost equivalent to
     // setEigParam from QUDA's multigrid_invert_test, except
diff --git a/quda_types.h b/quda_types.h
index 009c70925..a508fa683 100644
--- a/quda_types.h
+++ b/quda_types.h
@@ -91,6 +91,11 @@ typedef struct tm_QudaParams_t {
   int                  mg_coarse_solver_ca_basis_size[QUDA_MAX_MG_LEVEL];
   double               mg_coarse_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL];
   double               mg_coarse_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL];
+  
+  QudaCABasis          mg_smoother_solver_ca_basis[QUDA_MAX_MG_LEVEL];
+  int                  mg_smoother_solver_ca_basis_size[QUDA_MAX_MG_LEVEL];
+  double               mg_smoother_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL];
+  double               mg_smoother_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL];
 
   // parameters related to coarse grid deflation in the MG
   int                  mg_use_eig_solver[QUDA_MAX_MG_LEVEL];
diff --git a/read_input.l b/read_input.l
index 102c40b72..7ad0aa699 100644
--- a/read_input.l
+++ b/read_input.l
@@ -1985,6 +1985,18 @@ static inline double fltlist_next_token(int * const list_end){
   {SPC}*MGSetupCABasisLambdaMax{EQL}{STRLIST} {
     parse_quda_mg_dbl_par_array(yytext, &(quda_input.mg_setup_ca_lambda_max[0]));
   }
+  {SPC}*MGSmootherSolverCABasisType{EQL}{STRLIST} {
+    parse_quda_mg_cabasis_par_array(yytext, &(quda_input.mg_smoother_solver_ca_basis[0]));
+  }
+  {SPC}*MGSmootherSolverCABasisSize{EQL}{STRLIST} {
+    parse_quda_mg_int_par_array(yytext, &(quda_input.mg_smoother_solver_ca_basis_size[0]));
+  }
+  {SPC}*MGSmootherSolverCABasisLambdaMin{EQL}{STRLIST} {
+    parse_quda_mg_dbl_par_array(yytext, &(quda_input.mg_smoother_solver_ca_lambda_min[0]));
+  }
+  {SPC}*MGSmootherSolverCABasisLambdaMax{EQL}{STRLIST} {
+    parse_quda_mg_dbl_par_array(yytext, &(quda_input.mg_smoother_solver_ca_lambda_max[0]));
+  }
   {SPC}*MGCoarseSolverCABasisType{EQL}{STRLIST} {
     parse_quda_mg_cabasis_par_array(yytext, &(quda_input.mg_coarse_solver_ca_basis[0]));
   }
@@ -3876,6 +3888,11 @@ int read_input(char * conf_file){
     quda_input.mg_coarse_solver_ca_basis_size[level] = 4;
     quda_input.mg_coarse_solver_ca_lambda_min[level] = 0.0;
     quda_input.mg_coarse_solver_ca_lambda_max[level] = -1.0;
+    
+    quda_input.mg_smoother_solver_ca_basis[level] = QUDA_POWER_BASIS;
+    quda_input.mg_smoother_solver_ca_basis_size[level] = 4;
+    quda_input.mg_smoother_solver_ca_lambda_min[level] = 0.0;
+    quda_input.mg_smoother_solver_ca_lambda_max[level] = -1.0;
 
     /* note: when the user does not specify any blocking parameters,
      * a reasonable set will be computed automatically in the MG setup

From 1d3d0facf4e20c166e68adb5ed103cc302b3da01 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Fri, 5 May 2023 13:31:02 +0200
Subject: [PATCH 19/49] there is no setting for the smoother solver basis size

---
 quda_interface.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/quda_interface.c b/quda_interface.c
index 6e13002d9..362451614 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2014,7 +2014,6 @@ void _setQudaMultigridParam(QudaMultigridParam* mg_param) {
     mg_param->coarse_solver_ca_lambda_max[level] = quda_input.mg_coarse_solver_ca_lambda_max[level];
     
     mg_param->smoother_solver_ca_basis[level]      = quda_input.mg_smoother_solver_ca_basis[level];
-    mg_param->smoother_solver_ca_basis_size[level] = quda_input.mg_smoother_solver_ca_basis_size[level];
     mg_param->smoother_solver_ca_lambda_min[level] = quda_input.mg_smoother_solver_ca_lambda_min[level];
     mg_param->smoother_solver_ca_lambda_max[level] = quda_input.mg_smoother_solver_ca_lambda_max[level];
    

From b0414e45bf5b81d0e844df6b69bd5cc5942f00b8 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Fri, 5 May 2023 18:28:45 +0200
Subject: [PATCH 20/49] expose MGCoarseGuess as its own parameter as its
 unrelated to the QUDA eigensolver and fix a few of the basic parameters of
 the eigensolver

---
 quda_interface.c |  1 +
 quda_types.h     |  1 -
 read_input.l     | 25 ++++++++++++-------------
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 362451614..bde44916c 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -1865,6 +1865,7 @@ void _setQudaMultigridParam(QudaMultigridParam* mg_param) {
 
   mg_param->setup_type = QUDA_NULL_VECTOR_SETUP;
 
+  mg_param->coarse_guess = quda_input.mg_coarse_guess;
   mg_param->preserve_deflation = quda_input.mg_eig_preserve_deflation;
 
   mg_param->n_level = quda_input.mg_n_level;
diff --git a/quda_types.h b/quda_types.h
index a508fa683..6f84cf343 100644
--- a/quda_types.h
+++ b/quda_types.h
@@ -93,7 +93,6 @@ typedef struct tm_QudaParams_t {
   double               mg_coarse_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL];
   
   QudaCABasis          mg_smoother_solver_ca_basis[QUDA_MAX_MG_LEVEL];
-  int                  mg_smoother_solver_ca_basis_size[QUDA_MAX_MG_LEVEL];
   double               mg_smoother_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL];
   double               mg_smoother_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL];
 
diff --git a/read_input.l b/read_input.l
index 7ad0aa699..63ca96c3e 100644
--- a/read_input.l
+++ b/read_input.l
@@ -1838,6 +1838,14 @@ static inline double fltlist_next_token(int * const list_end){
     quda_input.mg_reset_setup_mdu_threshold = c;
     if(myverbose) printf("  MGResetSetupMDUThreshold set to %f line %d\n", quda_input.mg_reset_setup_mdu_threshold, line_of_file);
   }
+  {SPC}*MGCoarseGuess{EQL}yes {
+    quda_input.mg_coarse_guess = QUDA_BOOLEAN_YES;
+    if(myverbose) printf("  MGCoarseGuess set to YES in line %d\n", line_of_file);
+  }
+  {SPC}*MGCoarseGuess{EQL}no {
+    quda_input.mg_coarse_guess = QUDA_BOOLEAN_NO;
+    if(myverbose) printf("  MGCoarseGuess set to NO in line %d\n", line_of_file);
+  }
   {SPC}*MGUseEigSolver{EQL}{STRLIST} {
     parse_quda_mg_bool_par_array(yytext, &(quda_input.mg_use_eig_solver[0]));
   }
@@ -1929,14 +1937,6 @@ static inline double fltlist_next_token(int * const list_end){
     quda_input.mg_eig_preserve_deflation = QUDA_BOOLEAN_YES;
     if(myverbose) printf("  MGEigPreserveDeflationSubspace set to YES in line %d\n", line_of_file);
   }
-  {SPC}*MGEigSolverCoarseGuess{EQL}yes {
-    quda_input.mg_coarse_guess = QUDA_BOOLEAN_YES;
-    if(myverbose) printf("  MGEigSolverCoarseGuess set to YES in line %d\n", line_of_file);
-  }
-  {SPC}*MGEigSolverCoarseGuess{EQL}no {
-    quda_input.mg_coarse_guess = QUDA_BOOLEAN_NO;
-    if(myverbose) printf("  MGEigSolverCoarseGuess set to NO in line %d\n", line_of_file);
-  }
   {SPC}*MGEigSolverNumberOfVectors{EQL}{STRLIST} {
     parse_quda_mg_int_par_array(yytext, &(quda_input.mg_eig_nEv[0]));
   }
@@ -1988,9 +1988,6 @@ static inline double fltlist_next_token(int * const list_end){
   {SPC}*MGSmootherSolverCABasisType{EQL}{STRLIST} {
     parse_quda_mg_cabasis_par_array(yytext, &(quda_input.mg_smoother_solver_ca_basis[0]));
   }
-  {SPC}*MGSmootherSolverCABasisSize{EQL}{STRLIST} {
-    parse_quda_mg_int_par_array(yytext, &(quda_input.mg_smoother_solver_ca_basis_size[0]));
-  }
   {SPC}*MGSmootherSolverCABasisLambdaMin{EQL}{STRLIST} {
     parse_quda_mg_dbl_par_array(yytext, &(quda_input.mg_smoother_solver_ca_lambda_min[0]));
   }
@@ -3844,6 +3841,7 @@ int read_input(char * conf_file){
   quda_input.fermionbc = TM_QUDA_THETABC;
   quda_input.pipeline = 0;
   quda_input.gcrNkrylov = 10;
+  quda_input.coarse_guess = QUDA_BOOLEAN_NO;
   quda_input.reliable_delta = 1e-3; // anything smaller than this and we break down in double-half
   quda_input.mg_n_level = _default_quda_mg_n_level;
   quda_input.mg_setup_2kappamu = _default_quda_mg_setup_2kappamu;
@@ -3866,7 +3864,7 @@ int read_input(char * conf_file){
 
     quda_input.mg_use_eig_solver[level] = QUDA_BOOLEAN_NO;
     quda_input.mg_eig_preserve_deflation = QUDA_BOOLEAN_NO;
-    quda_input.mg_eig_tol[level] = 1.0e-3;
+    quda_input.mg_eig_tol[level] = 1.0e-6;
     quda_input.mg_eig_require_convergence[level] = QUDA_BOOLEAN_YES;
     quda_input.mg_eig_type[level] = QUDA_EIG_TR_LANCZOS;
     quda_input.mg_eig_spectrum[level] = QUDA_SPECTRUM_SR_EIG;
@@ -3878,6 +3876,8 @@ int read_input(char * conf_file){
     quda_input.mg_eig_poly_deg[level] = 100;
     quda_input.mg_eig_amin[level] = 1.0;
     quda_input.mg_eig_amax[level] = 5.0;
+    quda_input.mg_eig_nEv[level] = 0;
+    quda_input.mg_eig_nKr[level] = 0;
 
     quda_input.mg_setup_ca_basis[level] = QUDA_POWER_BASIS;
     quda_input.mg_setup_ca_basis_size[level] = 4;
@@ -3890,7 +3890,6 @@ int read_input(char * conf_file){
     quda_input.mg_coarse_solver_ca_lambda_max[level] = -1.0;
     
     quda_input.mg_smoother_solver_ca_basis[level] = QUDA_POWER_BASIS;
-    quda_input.mg_smoother_solver_ca_basis_size[level] = 4;
     quda_input.mg_smoother_solver_ca_lambda_min[level] = 0.0;
     quda_input.mg_smoother_solver_ca_lambda_max[level] = -1.0;
 

From 07959288dbdf822bbfb620770de18d243d790992 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Fri, 5 May 2023 18:29:16 +0200
Subject: [PATCH 21/49] make some more progress on the documentation of the
 coarse-grid deflated MG

---
 doc/quda.tex | 139 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 97 insertions(+), 42 deletions(-)

diff --git a/doc/quda.tex b/doc/quda.tex
index 6323c07ba..c2330fde4 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -17,8 +17,8 @@ \subsubsection{Design goals of the interface}
 	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). In the file {\ttfamily operators.c}, the QUDA library is initialized when an operator is initialized which has set {\ttfamily UseQudaInverter = yes}. There, the actual call to the inverter is conditionally replaced with a call to the QUDA interface.
 	\item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular:
 	\begin{itemize}
-		\item tmLQCD's $\theta$-boundary conditions are not compatible with QUDA's 8 and 12 parameter reconstruction of the gauge fields (as of QUDA-0.7.0). Therefore reconstruction/compression is deactivated by default, although it may be activated via the input file, see below.
-		\item The gaugefield is transferred each time to the GPU before the inversion starts in order to ensure not to miss any modifications of the gaugefield.
+		\item tmLQCD's $\theta$-boundary conditions are not compatible with QUDA's 8 and 12 parameter reconstruction of the gauge fields (as of QUDA-1.1.0). Therefore reconstruction/compression is deactivated by default, although it may be activated via the input file, see below.
+    \item The state of the host and device gauge fields is kept synchronised automatically and the gauge field is only transferred if it changes. This tracking of the state also applies to the clover field and the MG preconditioner setup.
 	\end{itemize}
 \end{enumerate}
 
@@ -83,16 +83,47 @@ \subsubsection{Usage}
 \end{verbatim}
 and the operator of interest will be inverted using QUDA. The initialization of QUDA is done automatically within the operator initialization,  the QUDA library should be finalized by a call to {\ttfamily \_endQuda()} just before finalizing MPI. When you use the QUDA interface for work that is being published, don't forget to cite \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa}.
 
+At the operator level as well as for fermionic monomials in the HMC, the following additional parameters affect QUDA solvers:
+\begin{itemize}
+  \item \texttt{Solver}: Type of solver to employ. Possible values for QUDA:
+    \begin{itemize}
+      \item \texttt{CG} (conjugate gradient)
+      \item \texttt{CGMMS} (multishift conjugate gradient)
+      \item \texttt{CGMMSND} (multishift conjugate gradient for the non-degenerate operator)
+      \item \texttt{BICGSTAB} (bi-conjugate gradient, useful for non-twisted Wilson and Wilson-clover fermions)
+      \item \texttt{MG} (multigrid-preconditioned GCR)
+    \end{itemize}
+    Unlike the tmLQCD-native solvers, QUDA solvers are always run in mixed-precision and there is no differentiation between the names of mixed- and fixed-precision solvers, the inner solver precision is instead set by \texttt{UseSloppyPrecision}.
+    (default: \texttt{CG})
+  \item \texttt{UseCompression}: Employ gauge compression. Possible values: 
+    \begin{itemize}
+      \item $8$ (use 8-real gauge compression)
+      \item $12$ (use two-row gauge compression)
+      \item $18$ (disable gauge compression)
+    \end{itemize}
+    Since tmLQCD always uses twisted fermionic boundary conditions, the default $18$ is recommended unless you know what you are doing. Note that if the global fermionic boundary condition parameters \texttt{Theta[X,Y,Z,T]} are set non-zero, this setting is overriden by default. See \texttt{FermionBC} to force a particular type of fermionic boundary conditions in QUDA. Note that the tmLQCD residual check will fail if compression is enabled and non-twisted boundary conditions are forced.
+  \item \texttt{UseSloppyPrecision}: Employ this precision for the inner solver of a mixed-precision solver (outer solvers are always run in double precision). Possible values:
+    \begin{itemize}
+      \item \texttt{no} / \texttt{double} (inner solver running in double precision)
+      \item \texttt{single} / \texttt{float} (inner solver running in single precision)
+      \item \texttt{half} (inner solver running in half precision)
+    \end{itemize}
+    (string, default: \texttt{double})
+  \item \texttt{RefinementPrecision}: When the operator or monomial uses the multishift (\texttt{cgmms[nd]}) solver and offloads to QUDA, this parameter sets the inner solver precision of shift-by-shift refinement solves. In practice, one might set \texttt{UseSloppyPrecision = single} and \texttt{RefinementPrecision = half}. This will iterate the residuals in the multishift solver up to single precision and then refine each solution using a double-half mixed-precision CG.
+\end{itemize}
+
+In additition, for the gauge monomial, the parameter \texttt{UseExternalLibrary = quda} can be used to offload the gauge force to QUDA.
+
+Finally, for the \texttt{GRADIENTFLOW} online measurement, the parameter \texttt{UseExternalLibrary = quda} will offload the gradient flow to QUDA.
+
 \subsubsection{General settings}
 Some properties of the QUDA interface can be configured via the {\ttfamily ExternalInverter} section.
-\begin{verbatim}
-BeginExternalInverter QUDA
-  FermionBC = [theta, pbc, apbc]
-EndExternalInverter
-\end{verbatim}
-
-The option {\ttfamily FermionBC} shown above forces twisted ({\ttfamily theta}), periodic ({\ttfamily pbc}) or antiperiodic ({\ttfamily apbc}) temporal quark field boundary conditions.
-This setting exists because at the time of writing (2017.12.28), there seems to be a bug or incompatibility in QUDA which causes (anti-)periodic boundary conditions with gauge compression to produce incorrect propagators.
+\begin{itemize}
+  \item \texttt{FermionBC} Forces twisted ({\ttfamily theta}), periodic ({\ttfamily pbc}) or antiperiodic ({\ttfamily apbc}) temporal quark field boundary conditions irrespective of what has been set for \texttt{ThetaT}. This setting exists because at the time of writing (2017.12.28), there seems to be a bug or incompatibility in QUDA which causes (anti-)periodic boundary conditions with gauge compression to produce incorrect propagators. Use with care as the residual check using tmLQCD operators will suggest a non-converged residual.
+  \item \texttt{Pipeline} The pipeline length for fused operations in some solvers (for the tmLQCD QUDA interface, at the time of writing in May 2023, this is just GCR). (positive integer, default: $0$)
+  \item \texttt{gcrNkrylov} 
+  \item \texttt{ReliableDelta}
+\end{itemize}
 
 \subsubsection{QUDA-MG interface}
 The interface has support for the QUDA Multigrid (MG) solver implementation and allows a number of parameters to be adjusted in order to tune the MG setup.
@@ -127,15 +158,23 @@ \subsubsection{QUDA-MG interface}
       \item \texttt{CACGNR} (communication-avoiding conjugate gradient normal residuals)
       \item \texttt{BiCGstab} (stabilised bi-conjugate gradient)
     \end{itemize}
-    Usage of \texttt{BiCGstab} may be recommended for Wilson or clover Wilson quarks while \texttt{CG} should always be used for twised mass or twisted clover quarks. \texttt{CACG} may be beneficial at the strong-scaling limit. At the time of writing (May 2023), this cannot be set on a per-level basis so the effect of the communication-avoiding setup solver on the setup time may be limited or even detrimental.
+    Usage of \texttt{BiCGstab} may be recommended for Wilson or clover Wilson quarks while \texttt{CG} should always be used for twised mass or twisted clover quarks. \texttt{CACG} may be beneficial at the strong-scaling limit, see also \texttt{MGSetupCABasisType} and \texttt{MGSetupCABasisSize}. At the time of writing (May 2023), this cannot be set on a per-level basis so the effect of the communication-avoiding setup solver on the setup time may be limited or even detrimental.
         (default: \texttt{CG})}
-  \item{ \texttt{MGSetupSolverTolerance}: relative target residual (unsquared!) during setup phase on a per-level basis. (comma-separated list of positive floats, default: $1\cdot10^{-6}, 1\cdot10^{-6}, \ldots$) }
-  \item{ \texttt{MGSetupMaxSolverIterations}: maximum number of iterations during setup phase on a per-level basis. (comma-separated list of positive integers, default: $1000, 1000, \ldots$) }
-  \item{ \texttt{MGCoarseSolverTolerance}: unsquared relative target residual on the coarse grids on a per-level basis. (comma-separated list of positive floats, default: $0.25, 0.25, \ldots$) }
-  \item{ \texttt{MGNumberOfVectors}: number of null vectors to compute on a per-level basis. (comma-separated list of integers, possible values $\left[ 24, 32 \right]$, default: $24, 24, \ldots$)}
-  \item{ \texttt{MGCoarseMaxSolveriterations}: maximum number of iterations on coarse grids on a per-level basis. (comma-separated list of positive integers, default: $75, 75, \ldots$) }
+  \item{ \texttt{MGVerbosity}: Verbosity of the solver on a per-level basis. Possible values:
+    \begin{itemize}
+      \item \texttt{silent} (suppress any additional output)
+      \item \texttt{summarize} (summary information such as number of iterations and achieved residual in the respective smoother or solver)
+      \item \texttt{verbose} (verbose information such as per-iteration residual printing of the respective solver or smoother)
+      \item \texttt{debug\_verbose} (extremely verbose information at each iteration of the respective smoother or solver)
+    \end{itemize}
+    (comma-separated list of strings, default: \texttt{silent} on all levels)}
+  \item{ \texttt{MGSetupSolverTolerance}: Relative target residual (unsquared!) during setup phase on a per-level basis. (comma-separated list of positive floats, default: $1\cdot10^{-6}$ on all levels) }
+  \item{ \texttt{MGSetupMaxSolverIterations}: Maximum number of iterations during setup phase on a per-level basis. (comma-separated list of positive integers, default: $1000$ on all levels) }
+  \item{ \texttt{MGCoarseSolverTolerance}: Unsquared relative target residual on the coarse grids on a per-level basis. (comma-separated list of positive floats, default: $0.25$ on all levels) }
+  \item{ \texttt{MGNumberOfVectors}: Number of null vectors to compute on a per-level basis. (comma-separated list of integers, possible values $\left[ 24, 32 \right]$, default: $24$ on all levels)}
+  \item{ \texttt{MGCoarseMaxSolveriterations}: Maximum number of iterations on coarse grids on a per-level basis. (comma-separated list of positive integers, default: $75$ on all levels) }
   \item{ \texttt{MGEnableSizeThreeBlocks}: Historically and by default, QUDA has had limited support for size $3$ aggregates. If set to \emph{yes}, the automatic blocking algorithm will attempt to use them for lattice extents divisible by $3$ when the local lattice extent at a given level is smaller than $16$ aggregate sites. This may require you to instantiate the necessary block sizes in QUDA (see comments below). In recent (2022 and later) commits of the QUDA \texttt{develop} branch, this can usually be set to \emph{yes} without having to modify QUDA. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
-  \item{ \texttt{MGBlockSizes[X,Y,Z,T]}: aggregate sizes on each level. When these are set non-zero for a given lattice dimension and a given level, the automatic blocking algorithm for that dimension and level is overridden and the specified blockings are forced. When the required aggregate sizes are not instantiated in QUDA, the setup phase will fail with an informative error message. (comma-separated list of positive integers, for a three level solver, for example, two numbers needs to be specified: aggregate size on the finest and intermediate level, default: \texttt{0, 0})}
+  \item{ \texttt{MGBlockSizes[X,Y,Z,T]}: Aggregate sizes on each level. When these are set non-zero for a given lattice dimension and a given level, the automatic blocking algorithm for that dimension and level is overridden and the specified blockings are forced. When the required aggregate sizes are not instantiated in QUDA, the setup phase will fail with an informative error message. (comma-separated list of positive integers, for a three level solver, for example, two numbers needs to be specified: aggregate size on the finest and intermediate level, default: \texttt{0, 0})}
 \end{itemize}
 
 \paragraph*{Automatic blocking:} If no blocking is specified manually, the aggregation parameters are set automatically as follows:
@@ -148,22 +187,22 @@ \subsubsection{QUDA-MG interface}
 
 \paragraph{Further basic MG parameters:}
 \begin{itemize}
-  \item{ \texttt{MGSmootherTolerance}: unsquared relative target residual of the smoother on a per-level basis. (comma-separated list of positive floats, default: $0.25, 0.25, \ldots$) }
-  \item{ \texttt{MGSmootherPreIterations}: number of smoothing steps before coarse grid correction on a per-level basis. (comma-separated list of zero or positive integers, default: $0, 0, \ldots$)}
-  \item{ \texttt{MGSmootherPostIterations}: number of smoothing steps after prolongation on a per-level basis. (comma-separated list of zero or positive integers, default: $4, 4, \ldots$)}
-  \item{ \texttt{MGOverUnderRelaxationFactor}: Over- or under-relaxation factor on a per-level basis. (comma-separated list of positive floats, default: $0.85, 0.85, \ldots$)}
+  \item{ \texttt{MGSmootherTolerance}: unsquared relative target residual of the smoother on a per-level basis. (comma-separated list of positive floats, default: $0.25$ on all levels) }
+  \item{ \texttt{MGSmootherPreIterations}: number of smoothing steps before coarse grid correction on a per-level basis. (comma-separated list of zero or positive integers, default: $0$ on all levels)}
+  \item{ \texttt{MGSmootherPostIterations}: number of smoothing steps after prolongation on a per-level basis. (comma-separated list of zero or positive integers, default: $4$ on all levels)}
+  \item{ \texttt{MGOverUnderRelaxationFactor}: Over- or under-relaxation factor on a per-level basis. (comma-separated list of positive floats, default: $0.85$ on all levels)}
   \item{ \texttt{MGCoarseMuFactor}: Scaling factor for twisted mass on a per-level basis, accelerates convergence and reduces condition numer of coarse grid solves. From experience it seems that it's reasonable to set this $>1.0$ only on the coarsest level, but sometimes it might also help on intermediate levels. If running with twisted mass, this should always be set and tuned for maximum efficiency. When using coarse-grid deflation (see \texttt{MGEigSolverRequireConvergence}), this should usually be set to $1.0$ on all levels. (comma-separated list of positive floats, usually $ > 1.0$, default $8.0$ from the second level upwards).}
   \item{ \texttt{MGSetup2KappaMu}: The value of $2\kappa\mu$ which should be used during the MG setup process. This is important in the HMC for standard twisted mass fermions, for example, because the setup should always be performed with the smallest quark mass to be employed in a simulation and it might be that a monomial with a heavier twisted quark mass is the first to call to MG and to thus trigger the setup. Generally this is set to the target light twisted quark mass. Setting this to $0.0$ implies that it is ignored. (float, default: $0.0$) }
-  \item{ \texttt{MGReuseSetupMuThreshold}: When the twisted quark mass is changed between solves using the MG solver, the MG setup is usually \emph{updated} for this new $\mu$ value. One can attempt to reuse the MG setup for solves with different mu values up to this threshold, i.e., when the condition $x < 2\kappa\cdot|\mu_\mathrm{old} - \mu_\mathrm{new}|$ holds. (positive float, default: \texttt{2*DBL\_EPSILON})}
+  \item{ \texttt{MGReuseSetupMuThreshold}: When the twisted quark mass is changed between solves using the MG solver, the MG setup is usually \emph{updated} for this new $\mu$ value. One can attempt to reuse the MG setup for solves with different $\mu$ values up to this threshold, i.e., when the condition $x < 2\kappa\cdot|\mu_\mathrm{old} - \mu_\mathrm{new}|$ holds. (positive float, default: \texttt{2*DBL\_EPSILON})}
   \item{ \texttt{MGRefreshSetupMDUThreshold}: When the MG is used in the HMC, the MG setup must be regularly refreshed by running a few iterations of the setup solver on the current set of approximate null vectors in order to evolve these with the changing gauge field. A good rule of thumb is to perform this setup refresh about twice per coarsest time step. In other words: for a trajectory length $\tau$ and $N$ integration steps on the coarsest time scale, the refresh should be performed at intervals of $(\tau/(2N)-\epsilon)$ MDUs, where $\epsilon$ is a small number to make sure that the threshold is hit at every half-step of the integrator. (positive float, default: \texttt{2*DBL\_EPSILON})}
-  \item{ \texttt{MGRefreshSetupMaxSolverIterations}: Number of setup iterations to be performed on a per-level basis when a setup refresh is required, see also \texttt{MGRefreshSetupMDUThreshold} above. (comma-separated list of positive integers, default: $0, 0, \ldots$}
+  \item{ \texttt{MGRefreshSetupMaxSolverIterations}: Number of setup iterations to be performed on a per-level basis when a setup refresh is required, see also \texttt{MGRefreshSetupMDUThreshold} above. This must be specified for the HMC and a value between 20 and 35 is usually sufficient. (comma-separated list of positive integers, default: $0$ on all levels}
   \item{ \texttt{MGResetSetupMDUThreshold}: When the MG is used in the HMC, the MG setup must be reset (instead of being refreshed) if the gauge configuration changes significantly at some point. This is the case, for example, when a trajectory is rejected. This value must always be set and a possible choice is to set it to the trajectory length. Note that there is an interaction with the preprocessor constant \texttt{TM\_GAUGE\_PROPAGATE\_THRESHOLD}, the default value of which is $3.0$. This is because when a trajectory is rejected, an internal variable which tracks the state of the gauge field is incremented by \texttt{TM\_GAUGE\_PROPAGATE\_THRESHOLD} to ensure that everything that depends on the gauge field is properly updated. This means that if the trajectory length $\tau \geq 3.0$, one should ignore the recommendation and set $x < 3.0$. In general, any value significantly larger than \texttt{MGRefreshSetupMDUThreshold} should be fine. (positive float, default: \texttt{2*DBL\_EPSILON}) }
   \item{ \texttt{MGRunVerify}: Check GPU coarse-grid operators against CPU reference implementation and verify Galerkin projectors during setup phase. This is usually fast enough to always be performed, although sometimes it seems to fail even though the setup works fine. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
   \item{ \texttt{MGRunObliqueProjectionCheck}: Measure how well the null vector subspace adjusts the low eigenmode subspace. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
   \item{ \texttt{MGLowModeCheck}: Measure how well the null vector subspace overlaps with the low eigenmode subspace. (boolean \emph{yes} or \emph{no}, default: \emph{no}) }
 \end{itemize}
 
-The default parameters and an automatically determined aggregation will generally produce a solver which will outperform mixed-precision CG by about one to two orders of magnitude (at the physical point) as long as the value for \texttt{MGCoarseMuFactor} on the coarsest grid is tuned appropriately. Further refinements are possible and improvements by up to another order of magnitude are achievable through smart choices for parameters related to the smoother, the number of vectors used on each level and the tolerances related to the smoother and coarse-grid solvers. These details are dependent on the target quark mass and the used ensemble and they may be dependent on the particular configuration (for certain ensembles, time to solution fluctuates quite strongly depending on the point in the molecular dynamics history). Finally, the tuning of the algorithmic parameters depends on the characteristics of the particular machine as one may compensate, for example, a poorly performing coarsest-grid operator through more iterations on the intermediate and fine grids. This kind of compensation will lead to a setup which is algorithmically less efficient (more matrix-vector products) but may have a lower overall time-to-solution.
+The default parameters and an automatically determined aggregation will generally produce a solver which will outperform mixed-precision CG by about one to two orders of magnitude (at the physical point) as long as the value for \texttt{MGCoarseMuFactor} on the coarsest grid is tuned appropriately. Further refinements are possible and improvements by up to another order of magnitude are achievable through smart choices for parameters related to the smoother, the number of vectors used on each level and the tolerances related to the smoother and coarse-grid solvers. These details are dependent on the target quark mass as well as the underlying ensemble and they may be dependent on the particular configuration (for certain ensembles, time to solution fluctuates quite strongly depending on the point in the molecular dynamics history). Finally, the tuning of the algorithmic parameters depends on the characteristics of the particular machine as one may compensate, for example, a poorly performing coarsest-grid operator through more iterations on the intermediate and fine grids. This kind of compensation will lead to a setup which is algorithmically less efficient (more matrix-vector products) but may have a lower overall time-to-solution.
 
 \paragraph{Advanced MG parameters:}
 \begin{itemize}
@@ -175,40 +214,50 @@ \subsubsection{QUDA-MG interface}
     \end{itemize}
     The recommendation is to use \texttt{gcr} on all intermediate and \texttt{cagcr} on the coarsest level.
     (comma-separated list of strings, default: \texttt{gcr} on all levels)}
-  \item{ \texttt{MGCoarseSolverCABasisType}: The type of basis to use for the communication-avoiding solver on a per-level basis. Ignored unless \texttt{cagcr} is set for \texttt{MGCoarseSolverType} for the given level. Possible values:
+  \item{ \texttt{MGSmootherType}: The type of algorithm used as a smoother on a per-level basis. Possible values:
+    \begin{itemize}
+      \item \texttt{cagcr} (communication-avoiding generalised conjugate residual)
+      \item \texttt{mr} (minimal residual)
+    \end{itemize}
+    It is often beneficial to employ \texttt{cagcr} on all levels as a smoother.
+    (comma-separated list of strings, default: \texttt{cagcr} on all levels)}
+\end{itemize}
+The fine-tuning parameters for the communication-avoiding solver are the same whether it is used as a smoother or coarse-grid solver.
+\begin{itemize}
+  \item{ \texttt{MGCoarseSolverCABasisType} and \texttt{MGSmootherSolverCABasisType}: The type of basis to use for the communication-avoiding solver on a per-level basis. Ignored unless \texttt{cagcr} is set for \texttt{MGCoarseSolverType} (\texttt{MGSmootherType}) for the given level. Possible values:
   \begin{itemize}
     \item \texttt{power} (power basis)
     \item \texttt{chebyshev} (Chebyshev basis)
   \end{itemize}
     (comma-separated list of strings, default: \texttt{power} on all levels)}
-  \item{ \texttt{MGCoarseSolverCABasisSize}: The size of the polynomial basis to use if a communication-avoiding solver is used on a per-level basis. Ignored unless \texttt{cagcr} is set for \texttt{MGCoarseSolverType} for the given level. Values larger than $4$ require \texttt{chebyshev} to be set for the corresponding \texttt{MGCoarseSolverCABasisType}. (comma-separated list of positive integers, default: $4$ on all levels)}
-  \item{ \texttt{MGCoarseSolverCABasisLambdaMin}: Minimum eigenvalue specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGCoarseSolverCABasisType}. The default value of $0.0$ is recommened for safety. (comma-separated list of postitive floats, default: $0.0$ on all levels)}
-  \item{ \texttt{MGCoarseSolverCABasisLambdaMax}: Maximum eigenvalue specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGCoarseSolverCABasisType}. The default value of $-1.0$ triggers power iterations which estimate this automatically. (comma-separated list of positive floats, default: $-1.0$ on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisSize}: The size of the polynomial basis to use if a communication-avoiding solver is used on a per-level basis. Ignored unless \texttt{cagcr} is set for \texttt{MGCoarseSolverType} for the given level. Values larger than $4$ require \texttt{chebyshev} to be set for the corresponding \texttt{MGCoarseSolverCABasisType}. There is no corresponding parameter for the smoother. (comma-separated list of positive integers, default: $4$ on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisLambdaMin} and \texttt{MGSmootherSolverCABasisLambdaMin}: Minimum eigenvalue specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGCoarseSolverCABasisType} (\texttt{MGSmootherSolverCABasisType}). The default value of $0.0$ is recommened for safety. (comma-separated list of postitive floats, default: $0.0$ on all levels)}
+  \item{ \texttt{MGCoarseSolverCABasisLambdaMax} and \texttt{MGSmootherSolverCABasisLambdaMax}: Maximum eigenvalue specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGCoarseSolverCABasisType} (\texttt{MGSmootherSolverCABasisType}). The default value of $-1.0$ triggers power iterations which estimate this automatically. (comma-separated list of positive floats, default: $-1.0$ on all levels)}
+\end{itemize}
+The fine-tuning parameters for the communication-avoiding solvers used in the MG setup are nominally the same as for the coarse-grid solver and smoother but this may diverge in principle as different solvers are employed in the setup.
+\begin{itemize}
   \item{ \texttt{MGSetupCABasisType}: The type of polynomial basis to use if a communication-avoiding solver is used in the setup on a per-level basis. Possible values:
   \begin{itemize}
     \item \texttt{power} (power basis)
-    \item \texttt{chebyshev} (Chebyshev)
+    \item \texttt{chebyshev} (Chebyshev basis)
   \end{itemize}
     The recommendation is to use \texttt{chebyshev} for stability reasons. (comma-separated list of strings, default: \texttt{power} on all levels)}
-  \item{ \texttt{MGSmootherType}: The type of algorithm used as a smoother on a per-level basis. Possible values:
-    \begin{itemize}
-      \item \texttt{cagcr} (communication-avoiding generalised conjugate residual)
-      \item \texttt{gcr} (generalised conjugate residual)
-      \item \texttt{mr} (minimal residual)
-    \end{itemize}
-    (comma-separated list of strings, default: \texttt{cagcr} on all levels)}
+  \item{ \texttt{MGSetupCABasisSize}: The size of the polynomial basis to use if a communication-avoiding solver is used in the setup on a per-level basis. Values larger than $4$ require \texttt{chebyshev} to be set for the corresponding \texttt{MGSetupCABasisType}. (comma-separated list of positive integers, default: $4$ on all levels)}
+  \item{ \texttt{MGSetupCABasisLambdaMin}: Minimum eigenvalues specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGSetupCABasisType}. The default value of $0.0$ is recommended for safety. (comma-separated list of positive floats, default: $0.0$ on all levels)}
+  \item{ \texttt{MGSetupCABasisLambdaMax}: Maximum eigenvalues specified for each level if the Chebyshev basis is used for the corresponding \texttt{MGSetupCABasisType}. The default value of $-1.0$ triggers power iterations which estimate this automatically. (comma-separated list of positive floats, default: $-1.0$ on all levels)}
 \end{itemize}
 
-\paragraph{Coarse-grid-deflated solver:}
+\paragraph{Coarse-grid-deflated MG solver:}
 Especially with twisted mass quarks, the coarse-grid operator is very poorly conditioned. One way of improving the convergence of the solver is the scaling of the $\mu$ parameter on the coarsest and intermediate grid as described in \texttt{MGCoarseMuFactor} above. A different and more potent way which pays off for propagator inversions (but is not suitable for the HMC) is to use coarse-grid deflation. In this case, the first $n_\mathrm{ev}$ eigenvectors of the coarse-grid operator are computed and the resulting deflation subspace is used to precondition the system. In this case, experience seems to indicate that \texttt{MGCoarseMuFactor} should be set to $1.0$ everywhere as it is no longer required to improve convergence. The options of QUDA's eigensolver are set via the following set of parameters:
 \begin{itemize}
-  \item{ \texttt{MGUseEigSolver}: Whether or not the eigensolver should be run for the operator on a per-level basis. The most common use-case would be to enable the eigensolver on the coarsest grid. (comma-separated list of booleans \emph{yes} or \emph{no}, default: \emph{no, no, \ldots})}
+  \item{ \texttt{MGUseEigSolver}: Whether or not the eigensolver should be run for the operator on a per-level basis. The most common use-case would be to enable the eigensolver on the coarsest grid. (comma-separated list of booleans \emph{yes} or \emph{no}, default: \emph{no} on all levels)}
+  \item{ \texttt{MGEigSolverRequireConvergence}: Require that the eigensolver actually must converge for the requested number of eigenvectors on a per-level basis as specified by \texttt{MGEigSolverNumberOfVectors}. Ignored if the corresponding \texttt{MGUseEigSolver} is set to \texttt{no}. (comma-separated list of booleans, default: \texttt{yes} on all levels)}
   \item{ \texttt{MGEigSolverType}: Type of eigensolver to use on a per-level basis. Possible values:
     \begin{itemize}
       \item \texttt{tr\_lanczos} (truncated Lanczos)
       \item \texttt{ir\_arnoldi} (implicitly-restarted Arnoldi)
     \end{itemize}
-    (comma-separated list of strings, default: \texttt{tr\_lanczos, tr\_lanczos, \ldots})}
+    (comma-separated list of strings, default: \texttt{tr\_lanczos} on all levels)}
   \item{ \texttt{MGEigSolverSpectrum}: Which part of the spectrum the eigensolver should focus on specified on a per-level basis. Possible values:
     \begin{itemize}
       \item \texttt{smallest\_real} (smallest eigenvalues based on their real part)
@@ -219,15 +268,21 @@ \subsubsection{QUDA-MG interface}
       \item \texttt{largest\_modulus} (largest eigenvalues in modulus)
     \end{itemize}
     For the purpose of deflating the coarse-grid system, the usual choice is \texttt{smallest\_real}.
-    (comma-separated list of strings, default: \texttt{smallest\_real, smallest\_real, \ldots})}
+    (comma-separated list of strings, default: \texttt{smallest\_real} on all levels)}
+  \item{ \texttt{MGEigSolverCheckConvergenceInterval}: When using an implicitly-restarted solver, perform this many restarts between convergence checks. (comma-separated list of positive integers, default: $5$ on all levels)}
+  \item{ \texttt{MGEigSolverMaxRestarts}: Perform at most this many restarts in the eigensolver specified on a per-level basis. (comma-separated list of positive integers, default: $100$ on all levels)}
+  \item{ \texttt{MGEigSolverTolerance}: The tolerance to use in the eigensolver on a per-level basis. (comma-separated list of floats, default: $1.0\cdot10^{-6}$ on all levels)}
+  \item{ \texttt{MGEigPreserveDeflationSubspace}: When a coarse-grid-deflated MG setup is updated between a change of the $\mu$ parameter, setting this to \texttt{yes} will preserve the deflation subspace. If set to \texttt{no}, the deflation subspace would be destroyed and regenerated upon a change of $\mu$. (boolean, default: \texttt{yes})}
+  \item{ \texttt{MGEigSolverNumberOfVectors}: Number of eigenvectors to compute on a per-level basis. Usually, this is intended for the coarsest level where, depending on the lattice size and physical parameters, numbers between $512$ and $3072$ may be required. (comma-separated list of positive integers, default: $0$ on all levels)}
+  \item{ \texttt{MGEigSolverUsePolynomialAcceleration}: Whether or not to use polynomial acceleration in the eigensolver on a per-level basis. (comma-separated list of booleans, default: \texttt{yes} everywhere)}
 \end{itemize}
 
-Note that at the time of writing (2017.12.28), only double-single mixed-precision is supported for the MG-preconditioned GCR solver and the solve will abort if a double-half precision solve is attempted.
+Note that at the time of writing (2017.12.28), only double-single mixed-precision is supported for the MG-preconditioned GCR solver and the solve will abort if a double-half precision solve is attempted. It should be noted that the null vectors are kept in half precision as recommended, however.
 
 A typical MG setup might look like this for twisted mass clover quarks: 
 
 \begin{verbatim}
-{BeginExternalInverter QUDA
+BeginExternalInverter QUDA
   MGNumberOfLevels = 3
   MGSetupSolver = cg
   MGSetupSolverTolerance = 1e-6

From 4b7e737b9bb95ea3dcfa79ec65ada2bae35a8033 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Tue, 9 May 2023 17:37:55 +0200
Subject: [PATCH 22/49] TM_USE_QUDA is checked before calling the interface

---
 phmc.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/phmc.c b/phmc.c
index df36e8c65..32900382e 100644
--- a/phmc.c
+++ b/phmc.c
@@ -229,6 +229,7 @@ void phmc_compute_ev(const int trajectory_counter,
 
   no_eigenvalues = 1;
   if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
+  #ifdef TM_USE_QUDA
     temp = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 0,
                         mnl->accprec, mnl->maxiter, mnl->solver, g_relative_precision_flag,
                                                        1, // we only support even-odd here
@@ -238,6 +239,15 @@ void phmc_compute_ev(const int trajectory_counter,
     if( fabs(mnl->EVMax - 1) < 2*DBL_EPSILON ) {
       temp = temp / mnl->StildeMax;
     }
+  #else
+    if(g_proc_id == 0) {
+      fprintf(stderr, "Error: Attempted to use QUDA eigensolver but this build was not configured for QUDA usage.\n");
+    #ifdef TM_USE_MPI
+      MPI_Finalize();
+    #endif
+      exit(-2);
+    }
+  #endif
   }else {
     temp = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 0, Qsq);
   }
@@ -245,6 +255,7 @@ void phmc_compute_ev(const int trajectory_counter,
   
   no_eigenvalues = 1;
   if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
+  #ifdef TM_USE_QUDA
     temp2 = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 1,
                          mnl->accprec, mnl->maxiter, mnl->solver, g_relative_precision_flag,
                                                        1, // we only support even-odd here
@@ -254,6 +265,15 @@ void phmc_compute_ev(const int trajectory_counter,
     if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
       temp2 = temp2 / mnl->StildeMax;
     }
+  #else
+    if(g_proc_id == 0) {
+      fprintf(stderr, "Error: Attempted to use QUDA eigensolver but this build was not configured for QUDA usage.\n");
+    #ifdef TM_USE_MPI
+      MPI_Finalize();
+    #endif
+      exit(-2);
+    }
+  #endif
   }else {
     temp2 = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 1, Qsq);
   }

From a0d71d38422788b8e4ef23b459b17bc00eafb61d Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Tue, 9 May 2023 17:49:57 +0200
Subject: [PATCH 23/49] modifying eigenvalue_precision in the same fashion as
 eigenvalues_bi

---
 quda_interface.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index fc8d1f51f..e33510ad2 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2633,8 +2633,15 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
   eig_param = newQudaEigParam();
 
   eig_param.invert_param = &inv_param;
-  eig_param.tol = tol;
-  eig_param.qr_tol = tol;
+  
+  if(tol < 1.e-14) {
+    eig_param.tol = 1.e-14;
+    eig_param.qr_tol = 1.e-14;
+  }else {
+    eig_param.tol = tol;
+    eig_param.qr_tol = tol;
+  }
+  
 
     
   if(blkwise == 1) {

From 6655987102fed07a14ec38f891c66262e731f190 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Tue, 16 May 2023 13:49:21 +0200
Subject: [PATCH 24/49] finally provide a complete documentation of the
 available MG parameters

---
 doc/bibliography.bib |  14 +++
 doc/quda.tex         | 221 +++++++++++++++++++++++++++++++------------
 2 files changed, 172 insertions(+), 63 deletions(-)

diff --git a/doc/bibliography.bib b/doc/bibliography.bib
index 1e4fcf20c..616d45e69 100644
--- a/doc/bibliography.bib
+++ b/doc/bibliography.bib
@@ -7904,3 +7904,17 @@ @inbook{Joo2013
 	year = "2013"
 }
 
+@article{Kostrzewa:2022hsv,
+    author = "Kostrzewa, Bartosz and Bacchio, Simone and Finkenrath, Jacob and Garofalo, Marco and Pittler, Ferenc and Romiti, Simone and Urbach, Carsten",
+    collaboration = "ETM",
+    title = "{Twisted mass ensemble generation on GPU machines}",
+    eprint = "2212.06635",
+    archivePrefix = "arXiv",
+    primaryClass = "hep-lat",
+    doi = "10.22323/1.430.0340",
+    journal = "PoS",
+    volume = "LATTICE2022",
+    pages = "340",
+    year = "2023"
+}
+
diff --git a/doc/quda.tex b/doc/quda.tex
index c2330fde4..00baa73b9 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -1,20 +1,20 @@
 %author: Mario Schroeck <mario.schroeck@roma3.infn.it>
 %author: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
 %date: 04/2015
-%date: 06/2017, 12/2017, 06/2018
+%date: 06/2017, 12/2017, 06/2018, 08/2019, 05/2022, 05/2023
 
 \subsection{QUDA: A library for QCD on GPUs}\label{subsec:quda}
 
 
-The QUDA \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa} interface provides access to QUDA's solvers both in the HMC and as a solver interface for propagator production.
+The QUDA \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa} interface provides access to QUDA's solvers both in the HMC~\cite{Kostrzewa:2022hsv} and as a solver interface for propagator production.
 
 \subsubsection{Design goals of the interface}
 The QUDA interface has been designed with the following goals in mind, sorted by priority:
 \begin{enumerate}
 	\item \emph{Safety.} Naturally, highest priority is given to the correctness of the output of the interface. 
 	This is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines.
-	\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseQudaInverter} is introduced which, when set to {\ttfamily yes}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} are supported.\footnote{{\ttfamily DBCLOVER} is supported by the interface but not by QUDA as of version 0.7.0.}
-	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). In the file {\ttfamily operators.c}, the QUDA library is initialized when an operator is initialized which has set {\ttfamily UseQudaInverter = yes}. There, the actual call to the inverter is conditionally replaced with a call to the QUDA interface.
+\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER, DBCLOVER} are supported. In the HMC, the same flag can be used to offload solves for the \texttt{DET, DETRATIO, CLOVERDET, CLOVERDETRATIO, RAT, RATCOR, NDRAT, NDRATCOR, NDCLOVERRAT} and \texttt{NDCLOVERRATCOR} monomials.
+	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). The QUDA interface is entered .
 	\item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular:
 	\begin{itemize}
 		\item tmLQCD's $\theta$-boundary conditions are not compatible with QUDA's 8 and 12 parameter reconstruction of the gauge fields (as of QUDA-1.1.0). Therefore reconstruction/compression is deactivated by default, although it may be activated via the input file, see below.
@@ -125,10 +125,24 @@ \subsubsection{General settings}
   \item \texttt{ReliableDelta}
 \end{itemize}
 
+\subsubsection{More advanced settings}
+To achieve higher performance you may choose single (default) or even half precision as sloppy precision for the inner solver of the mixed precision inverter with reliable updates. After {\ttfamily BeginOperator} and before {\ttfamily EndOperator} set {\ttfamily UseSloppyPrecision = double|single|half}.
+The MG-preconditioned GCR solver only works in double-single mixed precision, but the null vectors are stored in half precision as recommended by Kate Clark.
+
+To activate compression of the gauge fields (in order to save bandwidth and thus to achieve higher performance), set {\ttfamily UseCompression = 8|12|18} within {\ttfamily BeginOperator} and {\ttfamily EndOperator}. 
+The default is 18 which corresponds to no compression. 
+Note that if you use compression, trivial (anti)periodic boundary conditions will be applied to the gauge fields, instead of the default $\theta$-boundary conditions. 
+As a consequence, the residual check on tmLQCD side will fail. 
+Moreover, compression is not applicable when using general $\theta$-boundary conditions in the spatial directions. 
+If trying to do so, compression will be de-activated automatically and the user gets informed via the standard output.
+The \texttt{FermionBC} setting can be used to force particular temporal boundary conditions to be applied to the gauge field in the Dirac operator.
+
+\subsubsection{Functionality}
+The QUDA interface can currently be used to invert {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} within a 4D multi-GPU (MPI) parallel environment with CG, BICGSTAB or MG-preconditioned GCR. QUDA uses even-odd preconditioning, if wanted ({\ttfamily UseEvenOdd = yes}), and the interface is set up to use a mixed precision solver by default. For more details on the QUDA settings check the function {\ttfamily \_initQuda()} in {\ttfamily quda\_interface.c}.
+
 \subsubsection{QUDA-MG interface}
 The interface has support for the QUDA Multigrid (MG) solver implementation and allows a number of parameters to be adjusted in order to tune the MG setup.
 The defaults for these parameters follow the recommendations of \url{https://github.com/lattice/quda/wiki/Multigrid-Solver}, which also provides useful hints for further tuning.
-Although some of the parameters can be set on a per-level basis, the interface currently only exposes a single setting for all levels, where appropriate.
 The K-cycle is used by default and there is currently no user-exposed option for changing this.
 
 The MG-preconditioned GCR solver is selected as follows:
@@ -185,6 +199,52 @@ \subsubsection{QUDA-MG interface}
   \item{ In all other cases, aggregation is disabled for this direction and level. This includes, for instance, extents divisible by primes other than $2$ or $3$. This means that if you need blocks of $5$ or $7$ for geometry reasons, you will need to choose a setup manually.}
 \end{itemize}
 
+A typical MG setup might look like this for twisted mass clover quarks: 
+
+\begin{verbatim}
+BeginExternalInverter QUDA
+  MGNumberOfLevels = 3
+  MGSetupSolver = cg
+  MGSetupSolverTolerance = 1e-6, 1e-6
+  MGSetupMaxSolverIterations = 1000, 1000
+  MGCoarseSolverTolerance = 0.25, 0.25, 0.25
+  MGCoarseSolverIterations = 75, 75, 75
+  MGSmootherTolerance = 0.25, 0.25, 0.25
+  MGSmootherPreIterations = 2, 2, 2
+  MGSmootherPostIterations = 4, 4, 4
+  MGOverUnderRelaxationFactor = 0.85, 0.85, 0.85
+  MGCoarseMuFactor = 1.0, 1.0, 12.0
+  MGNumberOfVectors = 24, 24, 32
+  MGRunVerify = yes
+  MGEnableSizeThreeBlocks = no
+EndExternalInverter
+\end{verbatim}
+
+Alternatively, a blocking can be specified manually:
+
+\begin{verbatim}
+BeginExternalInverter QUDA
+  MGNumberOfLevels = 3
+  MGBlockSizesX = 4, 3
+  MGBlockSizesY = 4, 3
+  MGBlockSizesZ = 6, 4
+  MGBlockSizesT = 6, 4
+  MGSetupSolver = cg
+  MGSetupSolverTolerance = 1e-6, 1e-6
+  MGSetupMaxSolverIterations = 1000, 1000
+  MGCoarseSolverTolerance = 0.25, 0.25, 0.25
+  MGCoarseSolverIterations = 75, 75, 75
+  MGSmootherTolerance = 0.25, 0.25, 0.25
+  MGSmootherPreIterations = 2, 2, 2
+  MGSmootherPostIterations = 4, 4, 4
+  MGOverUnderRelaxationFactor = 0.85, 0.85, 0.85
+  MGCoarseMuFactor = 1.0, 1.0, 12.0
+  MGRunVerify = yes
+EndExternalInverter
+\end{verbatim}
+
+Note that at the time of writing (2017.12.28), only double-single mixed-precision is supported for the MG-preconditioned GCR solver and the solve will abort if a double-half precision solve is attempted. It should be noted that the null vectors are kept in half precision as recommended, however.
+
 \paragraph{Further basic MG parameters:}
 \begin{itemize}
   \item{ \texttt{MGSmootherTolerance}: unsquared relative target residual of the smoother on a per-level basis. (comma-separated list of positive floats, default: $0.25$ on all levels) }
@@ -204,6 +264,47 @@ \subsubsection{QUDA-MG interface}
 
 The default parameters and an automatically determined aggregation will generally produce a solver which will outperform mixed-precision CG by about one to two orders of magnitude (at the physical point) as long as the value for \texttt{MGCoarseMuFactor} on the coarsest grid is tuned appropriately. Further refinements are possible and improvements by up to another order of magnitude are achievable through smart choices for parameters related to the smoother, the number of vectors used on each level and the tolerances related to the smoother and coarse-grid solvers. These details are dependent on the target quark mass as well as the underlying ensemble and they may be dependent on the particular configuration (for certain ensembles, time to solution fluctuates quite strongly depending on the point in the molecular dynamics history). Finally, the tuning of the algorithmic parameters depends on the characteristics of the particular machine as one may compensate, for example, a poorly performing coarsest-grid operator through more iterations on the intermediate and fine grids. This kind of compensation will lead to a setup which is algorithmically less efficient (more matrix-vector products) but may have a lower overall time-to-solution.
 
+For use in the HMC, a typical MG setup might look something like:
+
+\begin{verbatim}
+BeginExternalInverter QUDA
+  EnablePinnedMemoryPool = yes 
+  EnableDeviceMemoryPool = no
+
+  Pipeline = 16
+  gcrNkrylov = 24
+  MGRunVerify = no
+  MGNumberOfLevels = 3
+  MGNumberOfVectors = 24, 32
+  MGSetupSolver = cg
+  MGSetup2KappaMu = 0.00014900994792
+  MGVerbosity = silent, silent, silent
+  MGSetupSolverTolerance = 5e-7, 5e-7
+  MGSetupMaxSolverIterations = 1500, 1500
+  MGCoarseSolverType = gcr, gcr, cagcr
+
+  MGCoarseMuFactor = 1.0, 1.75, 130.0
+  MgCoarseSolverTolerance = 0.1, 0.25, 0.25
+  MGCoarseMaxSolverIterations = 15, 10, 10
+
+  MGSmootherType = cagcr, cagcr, cagcr
+  MGSmootherTolerance = 0.15, 0.15, 0.1
+
+  MGSmootherPreIterations = 0, 0, 1
+  MGSmootherPostIterations = 1, 3, 1
+
+  MGBlockSizesX = 4, 4
+  MGBlockSizesY = 4, 4
+  MGBlockSizesZ = 4, 2
+  MGBlockSizesT = 4, 2
+
+  MGOverUnderRelaxationFactor = 0.95, 0.95, 0.95
+  MGRefreshSetupMDUThreshold = 0.03124
+  MGRefreshSetupMaxSolverIterations = 35, 35
+  MGResetSetupMDUThreshold = 1.0
+EndExternalInverter
+\end{verbatim}
+
 \paragraph{Advanced MG parameters:}
 \begin{itemize}
   \item{ \texttt{MGCoarseSolverType}: The type of solver to be used on the intermediate and coarse grids. While this has to be specified for all levels, it will be ignored on the fine grid. Possible values:
@@ -248,10 +349,15 @@ \subsubsection{QUDA-MG interface}
 \end{itemize}
 
 \paragraph{Coarse-grid-deflated MG solver:}
-Especially with twisted mass quarks, the coarse-grid operator is very poorly conditioned. One way of improving the convergence of the solver is the scaling of the $\mu$ parameter on the coarsest and intermediate grid as described in \texttt{MGCoarseMuFactor} above. A different and more potent way which pays off for propagator inversions (but is not suitable for the HMC) is to use coarse-grid deflation. In this case, the first $n_\mathrm{ev}$ eigenvectors of the coarse-grid operator are computed and the resulting deflation subspace is used to precondition the system. In this case, experience seems to indicate that \texttt{MGCoarseMuFactor} should be set to $1.0$ everywhere as it is no longer required to improve convergence. The options of QUDA's eigensolver are set via the following set of parameters:
+Especially with twisted mass quarks, the coarse-grid operator is very poorly conditioned. One way of improving the convergence of the solver is the scaling of the $\mu$ parameter on the coarsest and intermediate grid as described in \texttt{MGCoarseMuFactor} above. A different and more potent way which pays off for propagator inversions (but is not suitable for the HMC) is to use coarse-grid deflation. In this case, the first $n_\mathrm{ev}$ eigenvectors of the coarse-grid operator are computed and the resulting deflation subspace is used to precondition the system. In this case, experience seems to indicate that \texttt{MGCoarseMuFactor} should be set to $1.0$ everywhere as it is no longer required to improve convergence.
+An example for twisted mass Wilson-clover multigrid can be found at \url{https://github.com/lattice/quda/wiki/Twisted-clover-deflated-multigrid} and some further documentation is to be found at \url{https://github.com/lattice/quda/wiki/QUDA's-eigensolvers}.
+
+The options of QUDA's eigensolver are set via the following set of parameters:
 \begin{itemize}
   \item{ \texttt{MGUseEigSolver}: Whether or not the eigensolver should be run for the operator on a per-level basis. The most common use-case would be to enable the eigensolver on the coarsest grid. (comma-separated list of booleans \emph{yes} or \emph{no}, default: \emph{no} on all levels)}
   \item{ \texttt{MGEigSolverRequireConvergence}: Require that the eigensolver actually must converge for the requested number of eigenvectors on a per-level basis as specified by \texttt{MGEigSolverNumberOfVectors}. Ignored if the corresponding \texttt{MGUseEigSolver} is set to \texttt{no}. (comma-separated list of booleans, default: \texttt{yes} on all levels)}
+  \item{ \texttt{MGEigSolverUseNormOp}: When set to \texttt{yes} on a per-level basis, the eigensolver uses $MM^\dagger$ instead of $M$. (comma-separated list of booleans, default: \texttt{no} on all levels)}
+  \item{ \texttt{MGEigSolverUseDagger}: When set to \texttt{yes} on a per-level basis, the eigensolver uses $M^\dagger$ instead of $M$ (or $M^\dagger M$ when \texttt{MGEigSolverUseNormOp} is set to \texttt{yes} on the respective level). (comma-separated list of booleans, defaullt: \texttt{no} on all levels)}
   \item{ \texttt{MGEigSolverType}: Type of eigensolver to use on a per-level basis. Possible values:
     \begin{itemize}
       \item \texttt{tr\_lanczos} (truncated Lanczos)
@@ -273,71 +379,60 @@ \subsubsection{QUDA-MG interface}
   \item{ \texttt{MGEigSolverMaxRestarts}: Perform at most this many restarts in the eigensolver specified on a per-level basis. (comma-separated list of positive integers, default: $100$ on all levels)}
   \item{ \texttt{MGEigSolverTolerance}: The tolerance to use in the eigensolver on a per-level basis. (comma-separated list of floats, default: $1.0\cdot10^{-6}$ on all levels)}
   \item{ \texttt{MGEigPreserveDeflationSubspace}: When a coarse-grid-deflated MG setup is updated between a change of the $\mu$ parameter, setting this to \texttt{yes} will preserve the deflation subspace. If set to \texttt{no}, the deflation subspace would be destroyed and regenerated upon a change of $\mu$. (boolean, default: \texttt{yes})}
-  \item{ \texttt{MGEigSolverNumberOfVectors}: Number of eigenvectors to compute on a per-level basis. Usually, this is intended for the coarsest level where, depending on the lattice size and physical parameters, numbers between $512$ and $3072$ may be required. (comma-separated list of positive integers, default: $0$ on all levels)}
+  \item{ \texttt{MGEigSolverNumberOfVectors}: Number of eigenvectors to compute on a per-level basis. Usually, this is intended for the coarsest level where, depending on the lattice size and physical parameters, numbers between $512$ and $3072$ may be required. On the levels where \texttt{MGEigSolverRequireConvergence = no} is set, this should be set to the value of \texttt{MGNumberOfVectors} on that level. (comma-separated list of positive integers, default: $0$ on all levels)}
+  \item{ \texttt{MGEigSolverKrylovSubspaceSize}: Size of the Krylov subspace to be used to be built using the eigenvectors. A good rule of thumb is to set this to a small multiple ($2\times$ or $3\times$) of the corresponding value for \texttt{MGEigSolverNumberOfVectors} for the respective level. On levels where \texttt{MGEigSolverRequireConvergence = no} is set, this should be set to the corresponding value of \texttt{MGNumberOfVectors}. (comma-separated list of positive integers, default: $0$ on all levels)}
   \item{ \texttt{MGEigSolverUsePolynomialAcceleration}: Whether or not to use polynomial acceleration in the eigensolver on a per-level basis. (comma-separated list of booleans, default: \texttt{yes} everywhere)}
+  \item{ \texttt{MGEigSolverPolynomialDegree}: Degree of the polynomial used for polynomial acceleration of the eigensolver, specified on a per-level basis. This will be ignored except if \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set on a given level. Values between $50$ to $100$ seem to work well. (comma-separated list of postive integers, default: $100$ on all levels)}
+  \item{ \texttt{MGEigSolverPolyMin}: Smallest eigenvalue to suppress via polynomial acceleration. It should not be set too low and only lowered step by step from its default value. Has a large impact on the rate of convergence of the eigensolver when set correctly. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set on the corresponding level. (comma-separated list of positive real numbers, default: $1.0$ on all levels)}
+  \item{ \texttt{MGEigSolverPolyMax}: Largest eigenvalue which should be suppressed via polynomial acceleration. This should be set about 10\% larger than the actual largest eigenvalue of the operator. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set. (comma-separated list of positive real numbers, default: $5.0$ on all levels)}
 \end{itemize}
 
-Note that at the time of writing (2017.12.28), only double-single mixed-precision is supported for the MG-preconditioned GCR solver and the solve will abort if a double-half precision solve is attempted. It should be noted that the null vectors are kept in half precision as recommended, however.
-
-A typical MG setup might look like this for twisted mass clover quarks: 
+A typical coarse-grid-deflated setup might look something like:
 
 \begin{verbatim}
 BeginExternalInverter QUDA
+  EnablePinnedMemoryPool = yes
+  EnableDeviceMemoryPool = no
+  Pipeline = 16
+  gcrNkrylov = 24
+  MGRunVerify = no
+  MGCoarseMuFactor = 1.0, 1.0, 1.0
   MGNumberOfLevels = 3
+  MGNumberOfVectors = 24, 32
   MGSetupSolver = cg
-  MGSetupSolverTolerance = 1e-6
-  MGSetupMaxSolverIterations = 1000
-  MGCoarseSolverTolerance = 0.25
-  MGCoarseSolverIterations = 75
-  MGSmootherTolerance = 0.25
-  MGSmootherPreIterations = 2
-  MGSmootherPostIterations = 4
-  MGOverUnderRelaxationFactor = 0.85
-  MGCoarseMuFactor = 1.0, 1.0, 12.0
-  MGNumberOfVectors = 24, 24, 32
-  MGRunVerify = yes
-  MGEnableSizeThreeBlocks = no
+  MGSetup2KappaMu = 0.000200774448
+  MGVerbosity = silent, silent, silent
+  MGBlockSizesX = 4, 2
+  MGBlockSizesY = 4, 2
+  MGBlockSizesZ = 4, 2
+  MGBlockSizesT = 4, 2
+  MGSetupSolverTolerance = 5e-7, 5e-7
+  MGSetupMaxSolverIterations = 1500, 1500
+  MGCoarseSolverType = gcr, gcr, cagcr
+  MgCoarseSolverTolerance = 0.1, 0.1, 0.1
+  MGCoarseMaxSolverIterations = 10, 10, 10
+  MGSmootherType = cagcr, cagcr, cagcr
+  MGSmootherTolerance = 0.1, 0.1, 0.1
+  MGSmootherPreIterations = 0, 0, 0
+  MGSmootherPostIterations = 2, 2, 2
+  MGOverUnderRelaxationFactor = 0.90, 0.90, 0.90
+
+  MGUseEigSolver = no, no, yes
+  MGEigSolverType = tr_lanczos, tr_lanczos, tr_lanczos
+  MGEigSolverSpectrum = smallest_real, smallest_real, smallest_real
+  MGEigPreserveDeflationSubspace = yes
+  MGEigSolverNumberOfVectors = 24, 32, 1024
+  MGEigSolverKrylovSubspaceSize = 24, 32, 2048
+  MGEigSolverRequireConvergence = no, no, yes
+  MGEigSolverMaxRestarts = 100, 100, 100
+  MGEigSolverTolerance = 1e-4, 1e-4, 1e-4
+  MGEigSolverUseNormOp  = no, no, yes
+  MGEigSolverUseDagger = no, no, no
+  MGEigSolverUsePolynomialAcceleration = no, no, yes
+  MGEigSolverPolynomialDegree = 100, 100, 100
+  MGEigSolverPolyMin = 0.6, 0.6, 0,6
+  MGEigSolverPolyMax = 8.0, 8.0, 8.0
+  MGCoarseSolverCABasisSize = 8, 8, 8
 EndExternalInverter
 \end{verbatim}
 
-Alternatively, a blocking can be specified manually:
-
-\begin{verbatim}
-BeginExternalInverter QUDA
-  MGNumberOfLevels = 3
-  MGBlockSizesX = 4, 3
-  MGBlockSizesY = 4, 3
-  MGBlockSizesZ = 6, 4
-  MGBlockSizesT = 6, 4
-  MGSetupSolver = cg
-  MGSetupSolverTolerance = 1e-6, 1e-6, 
-  MGSetupMaxSolverIterations = 1000
-  MGCoarseSolverTolerance = 0.25
-  MGCoarseSolverIterations = 75
-  MGSmootherTolerance = 0.25
-  MGSmootherPreIterations = 2
-  MGSmootherPostIterations = 4
-  MGOverUnderRelaxationFactor = 0.85
-  MGCoarseMuFactor = 1.0, 1.0, 12.0
-  MGRunVerify = yes
-  MGEnableSizeThreeBlocks = no
-EndExternalInverter
-\end{verbatim}
-
-\subsubsection{More advanced settings}
-To achieve higher performance you may choose single (default) or even half precision as sloppy precision for the inner solver of the mixed precision inverter with reliable updates. After {\ttfamily BeginOperator} and before {\ttfamily EndOperator} set {\ttfamily UseSloppyPrecision = double|single|half}.
-The MG-preconditioned GCR solver only works in double-single mixed precision, but the null vectors are stored in half precision as recommended by Kate Clark.
-
-To activate compression of the gauge fields (in order to save bandwidth and thus to achieve higher performance), set {\ttfamily UseCompression = 8|12|18} within {\ttfamily BeginOperator} and {\ttfamily EndOperator}. 
-The default is 18 which corresponds to no compression. 
-Note that if you use compression, trivial (anti)periodic boundary conditions will be applied to the gauge fields, instead of the default $\theta$-boundary conditions. 
-As a consequence, the residual check on tmLQCD side will fail. 
-Moreover, compression is not applicable when using general $\theta$-boundary conditions in the spatial directions. 
-If trying to do so, compression will be de-activated automatically and the user gets informed via the standard output.
-The \texttt{FermionBC} setting can be used to force particular temporal boundary conditions to be applied to the gauge field in the Dirac operator.
-
-\subsubsection{Functionality}
-The QUDA interface can currently be used to invert {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} within a 4D multi-GPU (MPI) parallel environment with CG, BICGSTAB or MG-preconditioned GCR. QUDA uses even-odd preconditioning, if wanted ({\ttfamily UseEvenOdd = yes}), and the interface is set up to use a mixed precision solver by default. For more details on the QUDA settings check the function {\ttfamily \_initQuda()} in {\ttfamily quda\_interface.c}.
-
-
-

From 3ffdd183e5ae1ef311da720dd9e779573a33b9ff Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Tue, 16 May 2023 19:51:12 +0200
Subject: [PATCH 25/49] fix typo

---
 read_input.l | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/read_input.l b/read_input.l
index 63ca96c3e..a32f72483 100644
--- a/read_input.l
+++ b/read_input.l
@@ -3841,7 +3841,7 @@ int read_input(char * conf_file){
   quda_input.fermionbc = TM_QUDA_THETABC;
   quda_input.pipeline = 0;
   quda_input.gcrNkrylov = 10;
-  quda_input.coarse_guess = QUDA_BOOLEAN_NO;
+  quda_input.mg_coarse_guess = QUDA_BOOLEAN_NO;
   quda_input.reliable_delta = 1e-3; // anything smaller than this and we break down in double-half
   quda_input.mg_n_level = _default_quda_mg_n_level;
   quda_input.mg_setup_2kappamu = _default_quda_mg_setup_2kappamu;

From 3efedbd256b7168976979618a4322f1cc59bef90 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Tue, 16 May 2023 20:19:24 +0200
Subject: [PATCH 26/49] write out gauge field when the solver fails during
 monomial_solve

---
 solver/monomial_solve.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/solver/monomial_solve.c b/solver/monomial_solve.c
index 66684bb85..3455b375c 100644
--- a/solver/monomial_solve.c
+++ b/solver/monomial_solve.c
@@ -82,8 +82,10 @@
 #endif
 #include "fatal_error.h"
 
-#include <io/params.h>
-#include <io/spinor.h>
+#include "io/params.h"
+#include "io/spinor.h"
+#include "io/gauge.h"
+#include "measure_gauge_action.h"
 
 #ifdef TM_USE_QUDA
 #  include "quda_interface.h"
@@ -216,6 +218,11 @@ int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
+    char outname[200];
+    snprintf(outname, 200, "conf_monomial_solve_fail.%04d.%.6f", nstore, g_gauge_state.gauge_id);
+    paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
+    int status = write_gauge_field(outname, 64, xlfInfo);
+    free(xlfInfo);
     fatal_error("Error: solver reported -1 iterations.", "solve_degenerate");
   }
 
@@ -425,6 +432,11 @@ int solve_mms_tm(spinor ** const P, spinor * const Q,
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
+    char outname[200];
+    snprintf(outname, 200, "conf_monomial_solve_fail.%04d.%.6f", nstore, g_gauge_state.gauge_id);
+    paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
+    int status = write_gauge_field(outname, 64, xlfInfo);
+    free(xlfInfo);
     fatal_error("Error: solver reported -1 iterations.", "solve_mms_tm");
   }
 
@@ -671,6 +683,11 @@ int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn,
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
+    char outname[200];
+    snprintf(outname, 200, "conf_monomial_solve_fail.%04d.%.6f", nstore, g_gauge_state.gauge_id);
+    paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
+    int status = write_gauge_field(outname, 64, xlfInfo);
+    free(xlfInfo);
     fatal_error("Error: solver reported -1 iterations.", "solve_mms_nd");
   }
 
@@ -726,6 +743,11 @@ int solve_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn,
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
+    char outname[200];
+    snprintf(outname, 200, "conf_monomial_solve_fail.%04d.%.6f", nstore, g_gauge_state.gauge_id);
+    paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
+    int status = write_gauge_field(outname, 64, xlfInfo);
+    free(xlfInfo);
     fatal_error("Error: solver reported -1 iterations.", "solve_mms_nd_plus");
   }
 

From 466105b725e48984490a254e5c6daef831af21df Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Wed, 17 May 2023 10:44:19 +0200
Subject: [PATCH 27/49] encapsulate monomial solve failure into function and
 invert ordering of gauge_id and nstore

---
 solver/monomial_solve.c | 37 +++++++++++++------------------------
 1 file changed, 13 insertions(+), 24 deletions(-)

diff --git a/solver/monomial_solve.c b/solver/monomial_solve.c
index 3455b375c..805899a5d 100644
--- a/solver/monomial_solve.c
+++ b/solver/monomial_solve.c
@@ -91,6 +91,15 @@
 #  include "quda_interface.h"
 #endif
 
+void solve_fail_write_config_and_abort(const char * const solver) {
+  char outname[200];
+  snprintf(outname, 200, "conf_monomial_solve_fail.%.6f.%04d", g_gauge_state.gauge_id, nstore);
+  paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
+  int status = write_gauge_field(outname, 64, xlfInfo);
+  free(xlfInfo);
+  fatal_error("Error: solver reported -1 iterations.", solver);
+}
+
 int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params,
                      const int max_iter, double eps_sq, const int rel_prec, 
                      const int N, matrix_mult f, int solver_type){
@@ -218,12 +227,7 @@ int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
-    char outname[200];
-    snprintf(outname, 200, "conf_monomial_solve_fail.%04d.%.6f", nstore, g_gauge_state.gauge_id);
-    paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
-    int status = write_gauge_field(outname, 64, xlfInfo);
-    free(xlfInfo);
-    fatal_error("Error: solver reported -1 iterations.", "solve_degenerate");
+    solve_fail_write_config_and_abort("solve_degenerate");
   }
 
   return (iteration_count);
@@ -432,12 +436,7 @@ int solve_mms_tm(spinor ** const P, spinor * const Q,
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
-    char outname[200];
-    snprintf(outname, 200, "conf_monomial_solve_fail.%04d.%.6f", nstore, g_gauge_state.gauge_id);
-    paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
-    int status = write_gauge_field(outname, 64, xlfInfo);
-    free(xlfInfo);
-    fatal_error("Error: solver reported -1 iterations.", "solve_mms_tm");
+    solve_fail_write_config_and_abort("solve_mms_tm");
   }
 
   return(iteration_count);
@@ -683,12 +682,7 @@ int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn,
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
-    char outname[200];
-    snprintf(outname, 200, "conf_monomial_solve_fail.%04d.%.6f", nstore, g_gauge_state.gauge_id);
-    paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
-    int status = write_gauge_field(outname, 64, xlfInfo);
-    free(xlfInfo);
-    fatal_error("Error: solver reported -1 iterations.", "solve_mms_nd");
+    solve_fail_write_config_and_abort("solve_mms_nd");
   }
 
   return (iteration_count);
@@ -743,12 +737,7 @@ int solve_mms_nd_plus(spinor ** const Pup, spinor ** const Pdn,
   tm_stopwatch_pop(&g_timers, 0, 1, "");
 
   if (iteration_count == -1 && g_barrier_monomials_convergence) {
-    char outname[200];
-    snprintf(outname, 200, "conf_monomial_solve_fail.%04d.%.6f", nstore, g_gauge_state.gauge_id);
-    paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
-    int status = write_gauge_field(outname, 64, xlfInfo);
-    free(xlfInfo);
-    fatal_error("Error: solver reported -1 iterations.", "solve_mms_nd_plus");
+    solve_fail_write_config_and_abort("solve_mms_nd_plus");
   }
 
   return iteration_count;

From 2d05dfe27a1890af69a3efaaf6c6a1d0a1a257d5 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Mon, 22 May 2023 10:36:41 +0200
Subject: [PATCH 28/49] write out failing gauge configuration also if the
 failure happens after an MG refresh

---
 quda_interface.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/quda_interface.c b/quda_interface.c
index 9922ef3ae..cd134e45c 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -115,6 +115,8 @@
 #include "tm_debug_printf.h"
 #include "phmc.h"
 #include "quda_gauge_paths.inc"
+#include "io/gauge.h"
+#include "measure_gauge_action.h"
 
 // nstore is generally like a gauge id, for measurements it identifies the gauge field
 // uniquely 
@@ -2223,9 +2225,16 @@ int invert_eo_degenerate_quda(spinor * const out,
                                                   rel_prec, even_odd_flag, solver_params,
                                                   sloppy_precision, compression, QpQm);
         if (ret_value >= max_iter) {
+          char outname[200];
+          snprintf(outname, 200, "conf_mg_refresh_fail.%.6f.%04d", g_gauge_state.gauge_id, nstore);
+          paramsXlfInfo * xlfInfo = construct_paramsXlfInfo(
+              measure_plaquette((const su3**)g_gauge_field)/(6.*VOLUME*g_nproc), nstore);
+          int status = write_gauge_field(outname, 64, xlfInfo);
+          free(xlfInfo);
+
           char errmsg[200];
           snprintf(errmsg, 200, "QUDA-MG solver failed to converge in %d iterations even after forced setup refresh. Terminating!",
-                   max_iter);
+                 max_iter);
           fatal_error(errmsg, __func__);
           return -1;
         } else {

From 5d5e6c494b96099664bc73dc8177db2284c6fdd4 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Mon, 22 May 2023 10:46:01 +0200
Subject: [PATCH 29/49] set BarrierMonimialsConverge to default to YES

---
 default_input_values.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/default_input_values.h b/default_input_values.h
index b6b30f690..bf7ceaf48 100644
--- a/default_input_values.h
+++ b/default_input_values.h
@@ -204,7 +204,7 @@
 #define _default_subprocess_flag 0
 #define _default_lowmem_flag 0
 
-#define _default_g_barrier_monomials_convergence 0
+#define _default_g_barrier_monomials_convergence 1
 
 /* default input values for QUDA interface */
 /* These follow the recommendations of https://github.com/lattice/quda/wiki/Multigrid-Solver */

From f6d68c8cda2f2eee8c034f37d874d279eb5ae020 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Wed, 24 May 2023 18:41:56 +0200
Subject: [PATCH 30/49] first stab at using polynomial acceleration for the ND
 eigensolver

---
 quda_interface.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 694647aa9..267a96d54 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -5,6 +5,8 @@
  *               2018       Bartosz Kostrzewa, Ferenc Pittler
  *               2019, 2020 Bartosz Kostrzewa
  *               2021       Bartosz Kostrzewa, Marco Garofalo, Ferenc Pittler, Simone Bacchio
+ *               2022       Simone Romiti, Bartosz Kostrzewa
+ *               2023       Aniket Sen, Bartosz Kostrzewa
  *
  * This file is part of tmLQCD.
  *
@@ -2673,9 +2675,14 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
     eig_param.use_norm_op = QUDA_BOOLEAN_FALSE;
   }
 
-  /* Not using polynomial acceleration for now.
-   * Might be useful to add the support. */
-  eig_param.use_poly_acc = QUDA_BOOLEAN_FALSE;
+  // BK: these defaults seem to work on a 32c64 ensemble
+  // at a relatively coarse lattice spacing for the eigenvalues
+  // of the twisted-clover ND operator with values of musigma / mudelta
+  // reproducing physical sea strange and charm quark masses
+  eig_param.use_poly_acc = maxmin == 1 ? QUDA_BOOLEAN_FALSE : QUDA_BOOLEAN_TRUE;
+  eig_param.poly_deg = 128;
+  eig_param.a_min = 1e-3;
+  eig_param.a_max = 4;
   
   /* Daggers the operator. Not necessary for 
    * most cases. */
@@ -2726,6 +2733,7 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
   eigensolveQuda(NULL, eigenvls, &eig_param);
 
   returnvalue = eigenvls[0];
+  free(eigenvls);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
 

From 4e6033f3d0633b87d1ed3e20e2ce2cac358de178 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Fri, 9 Jun 2023 15:19:56 +0200
Subject: [PATCH 31/49] use own QudaInvertParam struct for
 eig_param.invert_param to not disturb global inv_param when changing
 precision, adjust indentation in one place

---
 quda_interface.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 267a96d54..4a985641c 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -247,7 +247,7 @@ void _setDefaultQudaParam(void){
   QudaPrecision cpu_prec  = QUDA_DOUBLE_PRECISION;
   QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION;
   QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
-  QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION;
+  QudaPrecision cuda_prec_precondition = QUDA_SINGLE_PRECISION;
 
   QudaTune tune = QUDA_TUNE_YES;
 
@@ -2608,17 +2608,17 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
   // it returns if quda is already init
   _initQuda();
 
-    if ( rel_prec )
-      inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
-    else
-      inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
+  if ( rel_prec )
+    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
+  else
+    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
 
-    inv_param.kappa = g_kappa;
-    
-    // figure out which BC tu use (theta, trivial...)
-    set_boundary_conditions(&compression, &gauge_param);
+  inv_param.kappa = g_kappa;
+  
+  // figure out which BC tu use (theta, trivial...)
+  set_boundary_conditions(&compression, &gauge_param);
 
-    set_sloppy_prec(sloppy_precision, refinement_precision, &gauge_param, &inv_param);
+  set_sloppy_prec(sloppy_precision, refinement_precision, &gauge_param, &inv_param);
 
   // load gauge after setting precision
   _loadGaugeQuda(compression);
@@ -2640,8 +2640,18 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
 
   // create new eig_param
   eig_param = newQudaEigParam();
-
-  eig_param.invert_param = &inv_param;
+  
+  // need our own QudaInvertParam for passing the operator properties
+  // as we modify the precision below 
+  QudaInvertParam eig_invert_param = newQudaInvertParam();
+  eig_invert_param = inv_param;
+  eig_param.invert_param = &eig_invert_param;
+  /* AS The following two are set to cuda_prec, otherwise 
+   * it gives an error. Such high precision might not be
+   * necessary. But have not found a way to consistently set
+   * the different precisions. */
+  eig_param.invert_param->cuda_prec_eigensolver = inv_param.cuda_prec;
+  eig_param.invert_param->clover_cuda_prec_eigensolver = inv_param.clover_cuda_prec;
   
   if(tol < 1.e-14) {
     eig_param.tol = 1.e-14;
@@ -2703,12 +2713,6 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
   if(maxmin == 1) eig_param.spectrum = QUDA_SPECTRUM_LR_EIG;
   else eig_param.spectrum = QUDA_SPECTRUM_SR_EIG;
 
-  /* The following two are set to cuda_prec, otherwise 
-   * it gives an error. Such high precision might not be
-   * necessary. But have not found a way to consistently set
-   * the different precisions. */
-  eig_param.invert_param->cuda_prec_eigensolver = inv_param.cuda_prec;
-  eig_param.invert_param->clover_cuda_prec_eigensolver = inv_param.cuda_prec;
 
   /* At the moment, the eigenvalues and eigenvectors are neither 
    * written to or read from disk, but if necessary, can be added

From 87e6b1c84fcc98d7799328d284ad359655a258d2 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Sat, 10 Jun 2023 23:28:25 +0300
Subject: [PATCH 32/49] fix coarse-grid-deflated MG and update documentation
 with instructions on how to determine the range of the spectrum of the coarse
 grid operator to optimise polynomial acceleration

---
 doc/quda.tex     | 16 ++++++++++++----
 quda_interface.c | 17 +++++++++++------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/doc/quda.tex b/doc/quda.tex
index 00baa73b9..44d46ef79 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -421,8 +421,8 @@ \subsubsection{QUDA-MG interface}
   MGEigSolverType = tr_lanczos, tr_lanczos, tr_lanczos
   MGEigSolverSpectrum = smallest_real, smallest_real, smallest_real
   MGEigPreserveDeflationSubspace = yes
-  MGEigSolverNumberOfVectors = 24, 32, 1024
-  MGEigSolverKrylovSubspaceSize = 24, 32, 2048
+  MGEigSolverNumberOfVectors = 0, 0, 1024
+  MGEigSolverKrylovSubspaceSize = 0, 0, 3072
   MGEigSolverRequireConvergence = no, no, yes
   MGEigSolverMaxRestarts = 100, 100, 100
   MGEigSolverTolerance = 1e-4, 1e-4, 1e-4
@@ -430,9 +430,17 @@ \subsubsection{QUDA-MG interface}
   MGEigSolverUseDagger = no, no, no
   MGEigSolverUsePolynomialAcceleration = no, no, yes
   MGEigSolverPolynomialDegree = 100, 100, 100
-  MGEigSolverPolyMin = 0.6, 0.6, 0,6
-  MGEigSolverPolyMax = 8.0, 8.0, 8.0
+  MGEigSolverPolyMin = 0.0, 0.0, 0.01
+  MGEigSolverPolyMax = 0.0, 0.0, 4.0
   MGCoarseSolverCABasisSize = 8, 8, 8
 EndExternalInverter
 \end{verbatim}
 
+In order to determine approppriate values for \texttt{MGEigSolverPolyMin} and \texttt{MGEigSolverPolyMax}, one should run the eigensolver once without polynomial acceleration (\texttt{MGEigSolverUsePolynomialAcceleration = no}) and run tmLQCD at \texttt{DebugLevel = 3} to trigger the eigenvalues to be printed by the eigensolver.
+Then, setting \texttt{MGEigSolverSpectrum = largest\_real}, \texttt{MGEigSolverNumberOfVectors = 1} and \texttt{MGEigSolverKrylovSubspaceSize = 16} (all on the coarsest grid, the third number in the example above), one can easily obtain the largest eigenvalue of the coarse grid operator for a particular configuration or set of configurations.
+Increasing this number by 10\% or so gives \texttt{MGEigSolverPolyMax}.
+Subsequently, setting \texttt{MGEigSolverSpectrum = smallest\_real}, \texttt{MGEigSolverNumberOfVectors = 0, 0, 1024}, \texttt{MGEigSolverKrylovSubspaceSize = 0, 0, 3072} will give the requisite 1024 smallest eigenvalues.
+Noting the largest of these, the next largest order of magnitude can be used for \texttt{MGEigSolverPolyMin}.
+In other words, if the largest of these smallest eigenvalues is $4\cdot10^{-3}$, for example, then \texttt{MGEigSolverPolyMin} can be set to 0.01.
+This ensures that the desired (smallest) part of the spectrum is smaller than \texttt{MGEigSolverPolyMin} and that the entire spectrum is contained in the range up to \texttt{MGEigSolverPolyMax}.
+After this, polynomial acceleration can be enabled, which should reduce setup time significantly.
diff --git a/quda_interface.c b/quda_interface.c
index bde44916c..06449c5c1 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -241,7 +241,7 @@ void _setDefaultQudaParam(void){
   QudaPrecision cpu_prec  = QUDA_DOUBLE_PRECISION;
   QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION;
   QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
-  QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION;
+  QudaPrecision cuda_prec_precondition = QUDA_SINGLE_PRECISION;
 
   QudaTune tune = QUDA_TUNE_YES;
 
@@ -1836,15 +1836,18 @@ void _setMGInvertParam(QudaInvertParam * mg_inv_param, const QudaInvertParam * c
   mg_inv_param->cuda_prec = inv_param->cuda_prec;
   mg_inv_param->cuda_prec_sloppy = inv_param->cuda_prec_sloppy;
   mg_inv_param->cuda_prec_refinement_sloppy = inv_param->cuda_prec_refinement_sloppy;
-  mg_inv_param->cuda_prec_precondition = inv_param->cuda_prec_precondition;
-  mg_inv_param->cuda_prec_eigensolver = inv_param->cuda_prec_eigensolver;
   
   mg_inv_param->clover_cpu_prec = inv_param->clover_cpu_prec;
   mg_inv_param->clover_cuda_prec = inv_param->clover_cuda_prec;
   mg_inv_param->clover_cuda_prec_sloppy = inv_param->clover_cuda_prec_sloppy;
   mg_inv_param->clover_cuda_prec_refinement_sloppy = inv_param->clover_cuda_prec_refinement_sloppy;
-  mg_inv_param->clover_cuda_prec_precondition = inv_param->clover_cuda_prec_precondition;
-  mg_inv_param->clover_cuda_prec_eigensolver = inv_param->clover_cuda_prec_eigensolver;
+  
+  // it seems that the MG-internal preconditioner and eigensolver need to be
+  // consistent with sloppy precision
+  mg_inv_param->cuda_prec_precondition = inv_param->cuda_prec_sloppy;
+  mg_inv_param->cuda_prec_eigensolver = inv_param->cuda_prec_sloppy;
+  mg_inv_param->clover_cuda_prec_precondition = inv_param->clover_cuda_prec_sloppy;
+  mg_inv_param->clover_cuda_prec_eigensolver = inv_param->clover_cuda_prec_sloppy;
   
   mg_inv_param->clover_order = inv_param->clover_order;
   mg_inv_param->gcrNkrylov = inv_param->gcrNkrylov;
@@ -2040,7 +2043,9 @@ void _setQudaMultigridParam(QudaMultigridParam* mg_param) {
 
       mg_eig_param[level].n_ev = quda_input.mg_eig_nEv[level];
       mg_eig_param[level].n_kr = quda_input.mg_eig_nKr[level];
-      mg_eig_param[level].n_conv = quda_input.mg_n_vec[level];
+      mg_eig_param[level].n_conv = quda_input.mg_eig_nEv[level]; // require convergence of all eigenvalues
+      mg_eig_param[level].n_ev_deflate = mg_eig_param[level].n_conv; // deflate all converged eigenvalues
+      // TODO expose this setting: mg_eig_param[level].batched_rotate = 128;
       mg_eig_param[level].require_convergence = quda_input.mg_eig_require_convergence[level];
 
       mg_eig_param[level].tol = quda_input.mg_eig_tol[level];

From 21dbfaa05136b6488e6638cdfa341c71e2c50223 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Mon, 12 Jun 2023 16:01:15 +0200
Subject: [PATCH 33/49] some small corrections in the documentation of the
 coarse-grid-deflated MG

---
 doc/quda.tex | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/quda.tex b/doc/quda.tex
index 44d46ef79..81b7b6b83 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -384,7 +384,7 @@ \subsubsection{QUDA-MG interface}
   \item{ \texttt{MGEigSolverUsePolynomialAcceleration}: Whether or not to use polynomial acceleration in the eigensolver on a per-level basis. (comma-separated list of booleans, default: \texttt{yes} everywhere)}
   \item{ \texttt{MGEigSolverPolynomialDegree}: Degree of the polynomial used for polynomial acceleration of the eigensolver, specified on a per-level basis. This will be ignored except if \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set on a given level. Values between $50$ to $100$ seem to work well. (comma-separated list of postive integers, default: $100$ on all levels)}
   \item{ \texttt{MGEigSolverPolyMin}: Smallest eigenvalue to suppress via polynomial acceleration. It should not be set too low and only lowered step by step from its default value. Has a large impact on the rate of convergence of the eigensolver when set correctly. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set on the corresponding level. (comma-separated list of positive real numbers, default: $1.0$ on all levels)}
-  \item{ \texttt{MGEigSolverPolyMax}: Largest eigenvalue which should be suppressed via polynomial acceleration. This should be set about 10\% larger than the actual largest eigenvalue of the operator. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set. (comma-separated list of positive real numbers, default: $5.0$ on all levels)}
+  \item{ \texttt{MGEigSolverPolyMax}: Largest eigenvalue which should be suppressed via polynomial acceleration. This should be set about 20\% larger than the actual largest eigenvalue of the operator. Ignored unless \texttt{MGEigSolverUsePolynomialAcceleration = yes} is set. (comma-separated list of positive real numbers, default: $5.0$ on all levels)}
 \end{itemize}
 
 A typical coarse-grid-deflated setup might look something like:
@@ -424,21 +424,21 @@ \subsubsection{QUDA-MG interface}
   MGEigSolverNumberOfVectors = 0, 0, 1024
   MGEigSolverKrylovSubspaceSize = 0, 0, 3072
   MGEigSolverRequireConvergence = no, no, yes
-  MGEigSolverMaxRestarts = 100, 100, 100
+  MGEigSolverMaxRestarts = 0, 0, 100
   MGEigSolverTolerance = 1e-4, 1e-4, 1e-4
   MGEigSolverUseNormOp  = no, no, yes
   MGEigSolverUseDagger = no, no, no
   MGEigSolverUsePolynomialAcceleration = no, no, yes
-  MGEigSolverPolynomialDegree = 100, 100, 100
+  MGEigSolverPolynomialDegree = 0, 0, 100
   MGEigSolverPolyMin = 0.0, 0.0, 0.01
-  MGEigSolverPolyMax = 0.0, 0.0, 4.0
-  MGCoarseSolverCABasisSize = 8, 8, 8
+  MGEigSolverPolyMax = 0.0, 0.0, 3.6
+  MGCoarseSolverCABasisSize = 4, 4, 4
 EndExternalInverter
 \end{verbatim}
 
 In order to determine approppriate values for \texttt{MGEigSolverPolyMin} and \texttt{MGEigSolverPolyMax}, one should run the eigensolver once without polynomial acceleration (\texttt{MGEigSolverUsePolynomialAcceleration = no}) and run tmLQCD at \texttt{DebugLevel = 3} to trigger the eigenvalues to be printed by the eigensolver.
 Then, setting \texttt{MGEigSolverSpectrum = largest\_real}, \texttt{MGEigSolverNumberOfVectors = 1} and \texttt{MGEigSolverKrylovSubspaceSize = 16} (all on the coarsest grid, the third number in the example above), one can easily obtain the largest eigenvalue of the coarse grid operator for a particular configuration or set of configurations.
-Increasing this number by 10\% or so gives \texttt{MGEigSolverPolyMax}.
+Increasing this number by 20\% or so gives a good value for \texttt{MGEigSolverPolyMax}.
 Subsequently, setting \texttt{MGEigSolverSpectrum = smallest\_real}, \texttt{MGEigSolverNumberOfVectors = 0, 0, 1024}, \texttt{MGEigSolverKrylovSubspaceSize = 0, 0, 3072} will give the requisite 1024 smallest eigenvalues.
 Noting the largest of these, the next largest order of magnitude can be used for \texttt{MGEigSolverPolyMin}.
 In other words, if the largest of these smallest eigenvalues is $4\cdot10^{-3}$, for example, then \texttt{MGEigSolverPolyMin} can be set to 0.01.

From 3d67264370cce89aca9872bcb72a2337cd2c825f Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Sat, 17 Jun 2023 10:51:44 +0200
Subject: [PATCH 34/49] user input for poly acc settings

---
 default_input_values.h |  4 ++++
 monomial/monomial.c    |  5 +++++
 monomial/monomial.h    |  2 ++
 phmc.c                 |  6 ++++--
 quda_interface.c       | 14 ++++++++------
 quda_interface.h       |  3 ++-
 read_input.l           | 20 ++++++++++++++++++++
 7 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/default_input_values.h b/default_input_values.h
index 64b1a3531..820f3d98e 100644
--- a/default_input_values.h
+++ b/default_input_values.h
@@ -147,6 +147,10 @@
 #define _default_phmc_pure_phmc 0
 #define _default_stilde_max 3.
 #define _default_stilde_min 0.01
+#define _default_eig_polydeg 128
+#define _default_eig_amin 0.001
+#define _default_eig_amax 4
+#define _default_eig_n_kr 96
 #define _default_degree_of_p 48
 #define _default_propagator_splitted 1
 #define _default_source_splitted 1
diff --git a/monomial/monomial.c b/monomial/monomial.c
index 4214fef5d..b7e8a55c5 100644
--- a/monomial/monomial.c
+++ b/monomial/monomial.c
@@ -144,6 +144,11 @@ int add_monomial(const int type) {
   monomial_list[no_monomials].PrecisionHfinal = _default_g_acc_Hfin;
   monomial_list[no_monomials].PrecisionPtilde = _default_g_acc_Ptilde;
 
+  monomial_list[no_monomials].eig_polydeg = _default_eig_polydeg;
+  monomial_list[no_monomials].eig_amin = _default_eig_amin;
+  monomial_list[no_monomials].eig_amax = _default_eig_amax;
+  monomial_list[no_monomials].eig_n_kr = _default_eig_n_kr;
+
   monomial_list[no_monomials].rat.order = 12;
   monomial_list[no_monomials].rat.range[0] = _default_stilde_min;
   monomial_list[no_monomials].rat.range[1] = _default_stilde_max;
diff --git a/monomial/monomial.h b/monomial/monomial.h
index c8f7e8b40..bbdb30dd2 100644
--- a/monomial/monomial.h
+++ b/monomial/monomial.h
@@ -114,6 +114,8 @@ typedef struct {
   double PrecisionHfinal;
   double StildeMin, StildeMax;
   double EVMin, EVMax, EVMaxInv;
+  int eig_polydeg, eig_n_kr;
+  double eig_amin, eig_amax;
   ExternalLibrary external_library;
   ExternalEigSolver external_eigsolver;
   double * MDPolyCoefs, * PtildeCoefs;
diff --git a/phmc.c b/phmc.c
index 32900382e..cf4e73b14 100644
--- a/phmc.c
+++ b/phmc.c
@@ -231,7 +231,8 @@ void phmc_compute_ev(const int trajectory_counter,
   if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
   #ifdef TM_USE_QUDA
     temp = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 0,
-                        mnl->accprec, mnl->maxiter, mnl->solver, g_relative_precision_flag,
+                        mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
+                        mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
                                                        1, // we only support even-odd here
                                                        mnl->solver_params.refinement_precision,
                                                        mnl->solver_params.sloppy_precision,
@@ -257,7 +258,8 @@ void phmc_compute_ev(const int trajectory_counter,
   if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
   #ifdef TM_USE_QUDA
     temp2 = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 1,
-                         mnl->accprec, mnl->maxiter, mnl->solver, g_relative_precision_flag,
+                         mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
+                         mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
                                                        1, // we only support even-odd here
                                                        mnl->solver_params.refinement_precision,
                                                        mnl->solver_params.sloppy_precision,
diff --git a/quda_interface.c b/quda_interface.c
index 4a985641c..c17b2015f 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2598,7 +2598,8 @@ Interface function for Eigensolver on Quda
 
 
 double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
-                    const double precision, const int max_iter, const int solver_flag, const int rel_prec,
+                    const double precision, const int max_iter, const int polydeg, const double amin, 
+                    const double amax, const int n_kr, const int solver_flag, const int rel_prec,
                     const int even_odd_flag, const SloppyPrecision refinement_precision,
                     SloppyPrecision sloppy_precision, CompressionType compression) {
 
@@ -2646,6 +2647,7 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
   QudaInvertParam eig_invert_param = newQudaInvertParam();
   eig_invert_param = inv_param;
   eig_param.invert_param = &eig_invert_param;
+  eig_param.invert_param->verbosity = QUDA_VERBOSE;
   /* AS The following two are set to cuda_prec, otherwise 
    * it gives an error. Such high precision might not be
    * necessary. But have not found a way to consistently set
@@ -2689,10 +2691,10 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
   // at a relatively coarse lattice spacing for the eigenvalues
   // of the twisted-clover ND operator with values of musigma / mudelta
   // reproducing physical sea strange and charm quark masses
-  eig_param.use_poly_acc = maxmin == 1 ? QUDA_BOOLEAN_FALSE : QUDA_BOOLEAN_TRUE;
-  eig_param.poly_deg = 128;
-  eig_param.a_min = 1e-3;
-  eig_param.a_max = 4;
+  eig_param.use_poly_acc = (maxmin == 1) && (polydeg != 0) ? QUDA_BOOLEAN_FALSE : QUDA_BOOLEAN_TRUE;
+  eig_param.poly_deg = polydeg;
+  eig_param.a_min = amin;
+  eig_param.a_max = amax;
   
   /* Daggers the operator. Not necessary for 
    * most cases. */
@@ -2730,7 +2732,7 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
    * From my understanding, QUDA automatically scales
    * this search space, however more testing on this 
    * might be necessary */
-  eig_param.n_kr = 96;
+  eig_param.n_kr = n_kr;
 
   eig_param.max_restarts = max_iterations;
 
diff --git a/quda_interface.h b/quda_interface.h
index 5320fd5ea..13fb578c9 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -176,7 +176,8 @@ void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FIL
 
 
 double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
-                    const double precision, const int max_iter, const int solver_flag, const int rel_prec,
+                    const double precision, const int max_iter, const int polydeg, const double amin, 
+                    const double amax, const int n_kr, const int solver_flag, const int rel_prec,
                     const int even_odd_flag, const SloppyPrecision refinement_precision,
                     SloppyPrecision sloppy_precision, CompressionType compression);
 
diff --git a/read_input.l b/read_input.l
index 4e7b0db83..3fde529d0 100644
--- a/read_input.l
+++ b/read_input.l
@@ -2607,6 +2607,26 @@ static inline double fltlist_next_token(int * const list_end){
     if(myverbose) printf("  Do not use external eigensolver line %d monomial %d\n", line_of_file, current_monomial);
     mnl->external_eigsolver = NO_EXT_EIGSOLVER;
   }
+  {SPC}*EigAmin{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    mnl->eig_amin = c;
+    if(myverbose!=0) printf("  eig_amin set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
+  }
+  {SPC}*EigAmax{EQL}{FLT} {
+    sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
+    mnl->eig_amax = c;
+    if(myverbose!=0) printf("  eig_amax set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
+  }
+  {SPC}*EigPolyDeg{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mnl->eig_polydeg = a;
+    if(myverbose!=0) printf("  eig_polydeg set to %d line %d monomial %d\n", a, line_of_file, current_monomial);
+  }
+  {SPC}*EigNkr{EQL}{DIGIT}+ {
+    sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
+    mnl->eig_n_kr = a;
+    if(myverbose!=0) printf("  eig_n_kr set to %d line %d monomial %d\n", a, line_of_file, current_monomial);
+  }
 }
 <NDPOLYMONOMIAL,CLPOLYMONOMIAL>{
   {SPC}*MaxPtildeDegree{EQL}{DIGIT}+ {

From 362c93b942094d93961e17771e042f28a9bafa02 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 21 Jun 2023 10:02:51 +0200
Subject: [PATCH 35/49] poly acc inputs updated

---
 read_input.l | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/read_input.l b/read_input.l
index 3fde529d0..54fe91b8a 100644
--- a/read_input.l
+++ b/read_input.l
@@ -2607,25 +2607,25 @@ static inline double fltlist_next_token(int * const list_end){
     if(myverbose) printf("  Do not use external eigensolver line %d monomial %d\n", line_of_file, current_monomial);
     mnl->external_eigsolver = NO_EXT_EIGSOLVER;
   }
-  {SPC}*EigAmin{EQL}{FLT} {
+  {SPC}*EigSolverPolyMin{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     mnl->eig_amin = c;
-    if(myverbose!=0) printf("  eig_amin set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
+    if(myverbose!=0) printf("  min for polynomial acceleration in eigensolver set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
   }
-  {SPC}*EigAmax{EQL}{FLT} {
+  {SPC}*EigSolverPolyMax{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     mnl->eig_amax = c;
-    if(myverbose!=0) printf("  eig_amax set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
+    if(myverbose!=0) printf("  max for polynomial acceleration in eigensolver set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
   }
-  {SPC}*EigPolyDeg{EQL}{DIGIT}+ {
+  {SPC}*EigSolverPolynomialDegree{EQL}{DIGIT}+ {
     sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
     mnl->eig_polydeg = a;
-    if(myverbose!=0) printf("  eig_polydeg set to %d line %d monomial %d\n", a, line_of_file, current_monomial);
+    if(myverbose!=0) printf("  degree of polynomial acceleration in eigensolver set to %d line %d monomial %d\n", a, line_of_file, current_monomial);
   }
-  {SPC}*EigNkr{EQL}{DIGIT}+ {
+  {SPC}*EigSolverKrylovSubspaceSize{EQL}{DIGIT}+ {
     sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
     mnl->eig_n_kr = a;
-    if(myverbose!=0) printf("  eig_n_kr set to %d line %d monomial %d\n", a, line_of_file, current_monomial);
+    if(myverbose!=0) printf("  Krylov subspace size for eigensolver set to %d line %d monomial %d\n", a, line_of_file, current_monomial);
   }
 }
 <NDPOLYMONOMIAL,CLPOLYMONOMIAL>{

From 8bbabe6af8271e3a53727c71b2611d456cb63aec Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Tue, 27 Jun 2023 17:33:26 +0200
Subject: [PATCH 36/49] added support for one flavour solver

---
 phmc.c           |  4 ++--
 quda_interface.c | 14 ++++++++++----
 quda_interface.h |  2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/phmc.c b/phmc.c
index cf4e73b14..d4c168eac 100644
--- a/phmc.c
+++ b/phmc.c
@@ -236,7 +236,7 @@ void phmc_compute_ev(const int trajectory_counter,
                                                        1, // we only support even-odd here
                                                        mnl->solver_params.refinement_precision,
                                                        mnl->solver_params.sloppy_precision,
-                                                       mnl->solver_params.compression_type);
+                                                       mnl->solver_params.compression_type, 0);
     if( fabs(mnl->EVMax - 1) < 2*DBL_EPSILON ) {
       temp = temp / mnl->StildeMax;
     }
@@ -263,7 +263,7 @@ void phmc_compute_ev(const int trajectory_counter,
                                                        1, // we only support even-odd here
                                                        mnl->solver_params.refinement_precision,
                                                        mnl->solver_params.sloppy_precision,
-                                                       mnl->solver_params.compression_type);
+                                                       mnl->solver_params.compression_type, 0);
     if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
       temp2 = temp2 / mnl->StildeMax;
     }
diff --git a/quda_interface.c b/quda_interface.c
index c17b2015f..9a6e5ef9b 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2601,7 +2601,7 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
                     const double precision, const int max_iter, const int polydeg, const double amin, 
                     const double amax, const int n_kr, const int solver_flag, const int rel_prec,
                     const int even_odd_flag, const SloppyPrecision refinement_precision,
-                    SloppyPrecision sloppy_precision, CompressionType compression) {
+                    SloppyPrecision sloppy_precision, CompressionType compression, const int oneFlavourFlag) {
 
   tm_stopwatch_push(&g_timers, __func__, "");
   
@@ -2624,9 +2624,15 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
   // load gauge after setting precision
   _loadGaugeQuda(compression);
 
-  _setTwoFlavourSolverParam(g_kappa, g_c_sw, g_mubar, g_epsbar, solver_flag, even_odd_flag, precision, max_iter,
-                            1 /*single_parity_solve */,
-                            1 /*always QpQm*/);
+  if ( oneFlavourFlag ) {
+    _setOneFlavourSolverParam(g_kappa, g_c_sw, g_mu, solver_flag, even_odd_flag, precision, max_iter,
+                              1 /*single_parity_solve */,
+                              1 /*always QpQm*/);
+  }else {
+    _setTwoFlavourSolverParam(g_kappa, g_c_sw, g_mubar, g_epsbar, solver_flag, even_odd_flag, precision, max_iter,
+                              1 /*single_parity_solve */,
+                              1 /*always QpQm*/);
+  }
 
   // QUDA applies the MMdag operator, we need QpQm^{-1) in the end
   // so we want QUDA to use the MdagM operator
diff --git a/quda_interface.h b/quda_interface.h
index 13fb578c9..eb0ee4447 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -179,6 +179,6 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
                     const double precision, const int max_iter, const int polydeg, const double amin, 
                     const double amax, const int n_kr, const int solver_flag, const int rel_prec,
                     const int even_odd_flag, const SloppyPrecision refinement_precision,
-                    SloppyPrecision sloppy_precision, CompressionType compression);
+                    SloppyPrecision sloppy_precision, CompressionType compression, const int oneFlavourFlag);
 
 #endif /* QUDA_INTERFACE_H_ */

From 721f3ba478fdeaeee8e38791c46cdc83469ed5c6 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 28 Jun 2023 10:29:48 +0200
Subject: [PATCH 37/49] fixed bug in setting eig_param.use_poly_acc

---
 quda_interface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quda_interface.c b/quda_interface.c
index 9a6e5ef9b..eeccb6d71 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2697,7 +2697,7 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
   // at a relatively coarse lattice spacing for the eigenvalues
   // of the twisted-clover ND operator with values of musigma / mudelta
   // reproducing physical sea strange and charm quark masses
-  eig_param.use_poly_acc = (maxmin == 1) && (polydeg != 0) ? QUDA_BOOLEAN_FALSE : QUDA_BOOLEAN_TRUE;
+  eig_param.use_poly_acc = (maxmin == 1) || (polydeg == 0) ? QUDA_BOOLEAN_FALSE : QUDA_BOOLEAN_TRUE;
   eig_param.poly_deg = polydeg;
   eig_param.a_min = amin;
   eig_param.a_max = amax;

From a8b0a62e88f982e64f2f2744575549b02eabdd05 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Wed, 19 Jul 2023 13:40:52 +0200
Subject: [PATCH 38/49] require an output location to be provided externally to
 eigsolveQuda in order to allow all requested eigenvalues to be returned, if
 desired

---
 phmc.c           | 54 +++++++++++++++++++++++++-----------------------
 quda_interface.c | 43 ++++++++++----------------------------
 quda_interface.h | 10 ++++-----
 3 files changed, 44 insertions(+), 63 deletions(-)

diff --git a/phmc.c b/phmc.c
index d4c168eac..550835513 100644
--- a/phmc.c
+++ b/phmc.c
@@ -211,7 +211,9 @@ void init_phmc() {
 void phmc_compute_ev(const int trajectory_counter,
 		     const int id,
 		     matrix_mult_bi Qsq) {
-  double atime, etime, temp=0., temp2=0.;
+  double atime, etime;
+  double eval_min = 0.;
+  double eval_max = 0.;
   int max_iter_ev, no_eigenvalues;
   char buf[100];
   char * phmcfilename = buf;
@@ -230,15 +232,15 @@ void phmc_compute_ev(const int trajectory_counter,
   no_eigenvalues = 1;
   if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
   #ifdef TM_USE_QUDA
-    temp = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 0,
-                        mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
-                        mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
-                                                       1, // we only support even-odd here
-                                                       mnl->solver_params.refinement_precision,
-                                                       mnl->solver_params.sloppy_precision,
-                                                       mnl->solver_params.compression_type, 0);
+    eigsolveQuda(&eval_min, no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 0,
+                 mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
+                 mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
+                 1, // we only support even-odd here
+                 mnl->solver_params.refinement_precision,
+                 mnl->solver_params.sloppy_precision,
+                 mnl->solver_params.compression_type, 0);
     if( fabs(mnl->EVMax - 1) < 2*DBL_EPSILON ) {
-      temp = temp / mnl->StildeMax;
+      eval_min /= mnl->StildeMax;
     }
   #else
     if(g_proc_id == 0) {
@@ -250,22 +252,22 @@ void phmc_compute_ev(const int trajectory_counter,
     }
   #endif
   }else {
-    temp = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 0, Qsq);
+    eval_min = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 0, Qsq);
   }
   
   
   no_eigenvalues = 1;
   if(mnl->external_eigsolver == QUDA_EIGSOLVER) {
   #ifdef TM_USE_QUDA
-    temp2 = eigsolveQuda(no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 1,
-                         mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
-                         mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
-                                                       1, // we only support even-odd here
-                                                       mnl->solver_params.refinement_precision,
-                                                       mnl->solver_params.sloppy_precision,
-                                                       mnl->solver_params.compression_type, 0);
+    eigsolveQuda(&eval_max, no_eigenvalues, eigenvalue_precision, 1, 0, max_iter_ev, 1,
+                 mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
+                 mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
+                 1, // we only support even-odd here
+                 mnl->solver_params.refinement_precision,
+                 mnl->solver_params.sloppy_precision,
+                 mnl->solver_params.compression_type, 0);
     if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
-      temp2 = temp2 / mnl->StildeMax;
+      eval_max /= mnl->StildeMax;
     }
   #else
     if(g_proc_id == 0) {
@@ -277,26 +279,26 @@ void phmc_compute_ev(const int trajectory_counter,
     }
   #endif
   }else {
-    temp2 = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 1, Qsq);
+    eval_max = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 1, Qsq);
   }
   
   
   if((g_proc_id == 0) && (g_debug_level > 1)) {
     printf("# %s: lowest eigenvalue end of trajectory %d = %e\n", 
-	   mnl->name, trajectory_counter, temp);
+	   mnl->name, trajectory_counter, eval_min);
     printf("# %s: maximal eigenvalue end of trajectory %d = %e\n", 
-	   mnl->name, trajectory_counter, temp2);
+	   mnl->name, trajectory_counter, eval_max);
   }
   if(g_proc_id == 0) {
-    if(temp2 > mnl->EVMax) {
-      fprintf(stderr, "\nWarning: largest eigenvalue for monomial %s larger than upper bound!\n\n", mnl->name);
+    if(eval_max > mnl->EVMax) {
+      fprintf(stderr, "\nWarning: largest eigenvalue for monomial %s: %.6f is larger than upper bound: %.6f\n\n", mnl->name, eval_max, mnl->EVMax);
     }
-    if(temp < mnl->EVMin) {
-      fprintf(stderr, "\nWarning: smallest eigenvalue for monomial %s smaller than lower bound!\n\n", mnl->name);
+    if(eval_min < mnl->EVMin) {
+      fprintf(stderr, "\nWarning: smallest eigenvalue for monomial %s: %.6f is smaller than lower bound: %.6f\n\n", mnl->name, eval_min, mnl->EVMin);
     }
     countfile = fopen(phmcfilename, "a");
     fprintf(countfile, "%.8d %1.5e %1.5e %1.5e %1.5e\n", 
-	    trajectory_counter, temp, temp2, mnl->EVMin, mnl->EVMax);
+	    trajectory_counter, eval_min, eval_max, mnl->EVMin, mnl->EVMax);
     fclose(countfile);
   }
   etime = gettime();
diff --git a/quda_interface.c b/quda_interface.c
index 6a157ce2b..0856b695c 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2611,11 +2611,11 @@ Interface function for Eigensolver on Quda
 *********************************************************/
 
 
-double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
-                    const double precision, const int max_iter, const int polydeg, const double amin, 
-                    const double amax, const int n_kr, const int solver_flag, const int rel_prec,
-                    const int even_odd_flag, const SloppyPrecision refinement_precision,
-                    SloppyPrecision sloppy_precision, CompressionType compression, const int oneFlavourFlag) {
+void eigsolveQuda(double * evals, int n_evals, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
+                  const double precision, const int max_iter, const int polydeg, const double amin, 
+                  const double amax, const int n_kr, const int solver_flag, const int rel_prec,
+                  const int even_odd_flag, const SloppyPrecision refinement_precision,
+                  SloppyPrecision sloppy_precision, CompressionType compression, const int oneFlavourFlag) {
 
   tm_stopwatch_push(&g_timers, __func__, "");
   
@@ -2648,17 +2648,6 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
                               1 /*always QpQm*/);
   }
 
-  // QUDA applies the MMdag operator, we need QpQm^{-1) in the end
-  // so we want QUDA to use the MdagM operator
-  inv_param.dagger = QUDA_DAG_YES;
-
-
-  _Complex double * eigenvls;
-  double returnvalue;
-
-  // allocate memory for eigenvalues
-  eigenvls = (_Complex double *)malloc((n)*sizeof(_Complex double));
-
   // create new eig_param
   eig_param = newQudaEigParam();
   
@@ -2675,6 +2664,8 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
   eig_param.invert_param->cuda_prec_eigensolver = inv_param.cuda_prec;
   eig_param.invert_param->clover_cuda_prec_eigensolver = inv_param.clover_cuda_prec;
   
+  // for consistency with tmLQCD's own eigensolver we require a precision of at least
+  // 1e-14
   if(tol < 1.e-14) {
     eig_param.tol = 1.e-14;
     eig_param.qr_tol = 1.e-14;
@@ -2683,8 +2674,6 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
     eig_param.qr_tol = tol;
   }
   
-
-    
   if(blkwise == 1) {
     eig_param.eig_type = QUDA_EIG_BLK_TR_LANCZOS;
     eig_param.block_size = blksize;
@@ -2707,10 +2696,6 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
     eig_param.use_norm_op = QUDA_BOOLEAN_FALSE;
   }
 
-  // BK: these defaults seem to work on a 32c64 ensemble
-  // at a relatively coarse lattice spacing for the eigenvalues
-  // of the twisted-clover ND operator with values of musigma / mudelta
-  // reproducing physical sea strange and charm quark masses
   eig_param.use_poly_acc = (maxmin == 1) || (polydeg == 0) ? QUDA_BOOLEAN_FALSE : QUDA_BOOLEAN_TRUE;
   eig_param.poly_deg = polydeg;
   eig_param.a_min = amin;
@@ -2745,9 +2730,9 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
 
   /* The size of eigenvector search space and
    * the number of required converged eigenvectors 
-   * is both set to n */
-  eig_param.n_conv = n;
-  eig_param.n_ev = n;
+   * is both set to n_evals */
+  eig_param.n_conv = n_evals;
+  eig_param.n_ev = n_evals;
   /* The size of the Krylov space is set to 96.
    * From my understanding, QUDA automatically scales
    * this search space, however more testing on this 
@@ -2756,13 +2741,7 @@ double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterati
 
   eig_param.max_restarts = max_iterations;
 
-  eigensolveQuda(NULL, eigenvls, &eig_param);
-
-  returnvalue = eigenvls[0];
-  free(eigenvls);
+  eigensolveQuda(NULL, evals, &eig_param);
 
   tm_stopwatch_pop(&g_timers, 0, 1, "TM_QUDA");
-
-  return(returnvalue);
-
 }
diff --git a/quda_interface.h b/quda_interface.h
index eb0ee4447..d133f54d6 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -175,10 +175,10 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
 
-double eigsolveQuda(int n, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
-                    const double precision, const int max_iter, const int polydeg, const double amin, 
-                    const double amax, const int n_kr, const int solver_flag, const int rel_prec,
-                    const int even_odd_flag, const SloppyPrecision refinement_precision,
-                    SloppyPrecision sloppy_precision, CompressionType compression, const int oneFlavourFlag);
+void eigsolveQuda(double * evals, int n_evals, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
+                  const double precision, const int max_iter, const int polydeg, const double amin, 
+                  const double amax, const int n_kr, const int solver_flag, const int rel_prec,
+                  const int even_odd_flag, const SloppyPrecision refinement_precision,
+                  SloppyPrecision sloppy_precision, CompressionType compression, const int oneFlavourFlag);
 
 #endif /* QUDA_INTERFACE_H_ */

From 0626f45019d79775bfadd4c9ab29ba2723f9d43f Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Wed, 19 Jul 2023 19:38:12 +0200
Subject: [PATCH 39/49] slight modification of verbose readinput output for
 eigensolver input parameters

---
 read_input.l | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/read_input.l b/read_input.l
index 54fe91b8a..da6143e9a 100644
--- a/read_input.l
+++ b/read_input.l
@@ -2610,17 +2610,17 @@ static inline double fltlist_next_token(int * const list_end){
   {SPC}*EigSolverPolyMin{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     mnl->eig_amin = c;
-    if(myverbose!=0) printf("  min for polynomial acceleration in eigensolver set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
+    if(myverbose!=0) printf("  Minimum eigenvalue to exclude using polynomial acceleration in eigensolver set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
   }
   {SPC}*EigSolverPolyMax{EQL}{FLT} {
     sscanf(yytext, " %[a-zA-Z] = %lf", name, &c);
     mnl->eig_amax = c;
-    if(myverbose!=0) printf("  max for polynomial acceleration in eigensolver set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
+    if(myverbose!=0) printf("  Maximum eigenvalues to exclude using polynomial acceleration in eigensolver set to %e line %d monomial %d\n", c, line_of_file, current_monomial);
   }
   {SPC}*EigSolverPolynomialDegree{EQL}{DIGIT}+ {
     sscanf(yytext, " %[a-zA-Z] = %d", name, &a);
     mnl->eig_polydeg = a;
-    if(myverbose!=0) printf("  degree of polynomial acceleration in eigensolver set to %d line %d monomial %d\n", a, line_of_file, current_monomial);
+    if(myverbose!=0) printf("  Degree of polynomial acceleration in eigensolver set to %d line %d monomial %d\n", a, line_of_file, current_monomial);
   }
   {SPC}*EigSolverKrylovSubspaceSize{EQL}{DIGIT}+ {
     sscanf(yytext, " %[a-zA-Z] = %d", name, &a);

From c5e8a138af09c53c63f04156935f048d84f39880 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Wed, 19 Jul 2023 19:48:22 +0200
Subject: [PATCH 40/49] the eigenvalues computed by QUDA are complex numbers,
 of course

---
 phmc.c           | 18 +++++++++---------
 quda_interface.c |  2 +-
 quda_interface.h |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/phmc.c b/phmc.c
index 550835513..a0f072e80 100644
--- a/phmc.c
+++ b/phmc.c
@@ -212,8 +212,8 @@ void phmc_compute_ev(const int trajectory_counter,
 		     const int id,
 		     matrix_mult_bi Qsq) {
   double atime, etime;
-  double eval_min = 0.;
-  double eval_max = 0.;
+  _Complex double eval_min = 0.0;
+  _Complex double eval_max = 0.0;
   int max_iter_ev, no_eigenvalues;
   char buf[100];
   char * phmcfilename = buf;
@@ -285,20 +285,20 @@ void phmc_compute_ev(const int trajectory_counter,
   
   if((g_proc_id == 0) && (g_debug_level > 1)) {
     printf("# %s: lowest eigenvalue end of trajectory %d = %e\n", 
-	   mnl->name, trajectory_counter, eval_min);
+	   mnl->name, trajectory_counter, creal(eval_min));
     printf("# %s: maximal eigenvalue end of trajectory %d = %e\n", 
-	   mnl->name, trajectory_counter, eval_max);
+	   mnl->name, trajectory_counter, creal(eval_max));
   }
   if(g_proc_id == 0) {
-    if(eval_max > mnl->EVMax) {
-      fprintf(stderr, "\nWarning: largest eigenvalue for monomial %s: %.6f is larger than upper bound: %.6f\n\n", mnl->name, eval_max, mnl->EVMax);
+    if(creal(eval_max) > mnl->EVMax) {
+      fprintf(stderr, "\nWarning: largest eigenvalue for monomial %s: %.6f is larger than upper bound: %.6f\n\n", mnl->name, creal(eval_max), mnl->EVMax);
     }
-    if(eval_min < mnl->EVMin) {
-      fprintf(stderr, "\nWarning: smallest eigenvalue for monomial %s: %.6f is smaller than lower bound: %.6f\n\n", mnl->name, eval_min, mnl->EVMin);
+    if(creal(eval_min) < mnl->EVMin) {
+      fprintf(stderr, "\nWarning: smallest eigenvalue for monomial %s: %.6f is smaller than lower bound: %.6f\n\n", mnl->name, creal(eval_min), mnl->EVMin);
     }
     countfile = fopen(phmcfilename, "a");
     fprintf(countfile, "%.8d %1.5e %1.5e %1.5e %1.5e\n", 
-	    trajectory_counter, eval_min, eval_max, mnl->EVMin, mnl->EVMax);
+	    trajectory_counter, creal(eval_min), creal(eval_max), mnl->EVMin, mnl->EVMax);
     fclose(countfile);
   }
   etime = gettime();
diff --git a/quda_interface.c b/quda_interface.c
index 0856b695c..2438ffd38 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2611,7 +2611,7 @@ Interface function for Eigensolver on Quda
 *********************************************************/
 
 
-void eigsolveQuda(double * evals, int n_evals, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
+void eigsolveQuda(_Complex double * evals, int n_evals, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
                   const double precision, const int max_iter, const int polydeg, const double amin, 
                   const double amax, const int n_kr, const int solver_flag, const int rel_prec,
                   const int even_odd_flag, const SloppyPrecision refinement_precision,
diff --git a/quda_interface.h b/quda_interface.h
index d133f54d6..05fc5444a 100644
--- a/quda_interface.h
+++ b/quda_interface.h
@@ -175,7 +175,7 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
 void compute_WFlow_quda(const double eps ,const double tmax, const int traj, FILE* outfile);
 
 
-void eigsolveQuda(double * evals, int n_evals, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
+void eigsolveQuda(_Complex double * evals, int n_evals, double tol, int blksize, int blkwise, int max_iterations, int maxmin,
                   const double precision, const int max_iter, const int polydeg, const double amin, 
                   const double amax, const int n_kr, const int solver_flag, const int rel_prec,
                   const int even_odd_flag, const SloppyPrecision refinement_precision,

From 4fe78b707ba986eeb178e8c5ba308cb87f479fe0 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Wed, 19 Jul 2023 19:48:40 +0200
Subject: [PATCH 41/49] document QUDA eigensolver for the HMC

---
 doc/quda.tex | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/doc/quda.tex b/doc/quda.tex
index 81b7b6b83..b034b1a16 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -444,3 +444,18 @@ \subsubsection{QUDA-MG interface}
 In other words, if the largest of these smallest eigenvalues is $4\cdot10^{-3}$, for example, then \texttt{MGEigSolverPolyMin} can be set to 0.01.
 This ensures that the desired (smallest) part of the spectrum is smaller than \texttt{MGEigSolverPolyMin} and that the entire spectrum is contained in the range up to \texttt{MGEigSolverPolyMax}.
 After this, polynomial acceleration can be enabled, which should reduce setup time significantly.
+
+\subsubsection{Using the QUDA eigensolver in the HMC}
+
+When employing the rational approximation, in order to make sure that the eigenvalue bounds are chosen appropriately, it is necessary to measure the maximal and minimal eigenvalues of the operator involved in the given monomial.
+For the monomials \texttt{NDRAT, NDRATCOR, NDCLOVERRAT} and \texttt{NDCLOVERRATCOR}, this can be done using QUDA's eigensolver when, in addition to a non-zero setting for \texttt{ComputeEVFreq}, \texttt{UseExternalEigSolver = quda} is set.
+
+The eigensolver further offers the following parameters:
+\begin{itemize}
+  \item{ \texttt{EigSolverPolynomialDegree}: Once appropriate parameters for the polynomial filter have been determined (see \texttt{EigSolverPolyMin} and \texttt{EigSolverPolyMax} below), when \texttt{EigSolverPolynomialDegree} is set to a non-zero value, polynomial acceleration will be used in the measurent of the smallest eigenvalue. (integer, default: \texttt{128}) }
+  \item{ \texttt{EigSolverPolyMin}: Smallest eigenvalue to be excluded by the polynomial filter when polynomial acceleration is used. A good value for this should be determined by first running the eigensolver without acceleration (\texttt{EigSolverPolynomialDegree = 0}). \texttt{EigSolverPolyMin} should then be set to about $3\lambda_\mathrm{min}$. Note that this is specified in the operator normalisation, such that $\lambda_\mathrm{min}$ obtained from the measurement should be multiplied by \texttt{StildeMax} to get an appropriate value for \texttt{EigSolverPolyMin}.  (positive real number, default: \texttt{0.001})}
+  \item{ \texttt{EigSolverPolyMax}: Largest eigenvalue to be excluded by the polynomial filter when polynomial acceleration is used. This should be set to a value in excess of the measured largest eigenvalue, $1.5\lambda_\mathrm{max}$, say. Note that this is specified in the operator normalisation such that the measured $\lambda_\mathrm{max}$ should be multiplied by \texttt{StildeMax} to obtain an appropriate value for \texttt{EigSolverPolyMax}. (positive real number, defaullt: \texttt{4.0})}
+  \item{ \texttt{EigSolverKrylovSubspaceSize}: Size of the Krylov space used for the determination of the smallest and largest eigenvalues. The default seems to work well even for large lattices. (integer, default: \texttt{96})}
+\end{itemize}
+
+

From a46c4c654b3275ecf06f73448b2b0612d3375c9d Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 2 Aug 2023 13:28:44 +0200
Subject: [PATCH 42/49] script for testing eigsolveQuda

---
 test_eigsolveQuda.c | 558 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 558 insertions(+)
 create mode 100644 test_eigsolveQuda.c

diff --git a/test_eigsolveQuda.c b/test_eigsolveQuda.c
new file mode 100644
index 000000000..e7baf3090
--- /dev/null
+++ b/test_eigsolveQuda.c
@@ -0,0 +1,558 @@
+#include "lime.h"
+#if HAVE_CONFIG_H
+#include<tmlqcd_config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+#include <sys/time.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <float.h>
+#ifdef TM_USE_MPI
+#include <mpi.h>
+#endif
+#ifdef TM_USE_OMP
+#include <omp.h>
+#endif
+#include "global.h"
+#include "git_hash.h"
+#include "io/params.h"
+#include "io/gauge.h"
+#include "getopt.h"
+#include "ranlxd.h"
+#include "geometry_eo.h"
+#include "start.h"
+#include "measure_gauge_action.h"
+#include "measure_rectangles.h"
+#ifdef TM_USE_MPI
+#include "xchange/xchange.h"
+#endif
+#include "read_input.h"
+#include "mpi_init.h"
+#include "sighandler.h"
+#include "update_tm.h"
+#include "init/init.h"
+#include "test/check_geometry.h"
+#include "boundary.h"
+#include "phmc.h"
+#include "solver/solver.h"
+#include "monomial/monomial.h"
+#include "integrator.h"
+#include "sighandler.h"
+#include "meas/measurements.h"
+#include "operator/tm_operators_nd.h"
+#ifdef DDalphaAMG
+#include "DDalphaAMG_interface.h"
+#endif
+#ifdef TM_USE_QUDA
+#  include "quda_interface.h"
+#endif
+
+extern int nstore;
+
+int const rlxdsize = 105;
+
+static void usage(const tm_ExitCode_t exit_code);
+static void process_args(int argc, char *argv[], char ** input_filename, char ** filename);
+static void set_default_filenames(char ** input_filename, char ** filename);
+
+int main(int argc,char *argv[]) {
+
+  FILE *parameterfile=NULL, *countfile=NULL;
+  char *filename = NULL;
+  char datafilename[206];
+  char parameterfilename[206];
+  char gauge_filename[50];
+  char nstore_filename[50];
+  char tmp_filename[50];
+  char *input_filename = NULL;
+  int status = 0, accept = 0;
+  int j,ix,mu, trajectory_counter=0;
+  unsigned int const io_max_attempts = 5; /* Make this configurable? */
+  unsigned int const io_timeout = 5; /* Make this configurable? */
+  struct timeval t1;
+
+  _Complex double eval_min = 0.0;
+  _Complex double eval_max = 0.0;
+
+  /* Energy corresponding to the Gauge part */
+  double plaquette_energy = 0., rectangle_energy = 0.;
+    /* Acceptance rate */
+  int Rate=0;
+  /* Do we want to perform reversibility checks */
+  /* See also return_check_flag in read_input.h */
+  int return_check = 0;
+
+  paramsXlfInfo *xlfInfo;
+
+/* For online measurements */
+  measurement * meas;
+  int imeas;
+
+  init_critical_globals(TM_PROGRAM_HMC_TM);
+
+#ifdef _KOJAK_INST
+#pragma pomp inst init
+#pragma pomp inst begin(main)
+#endif
+
+  #if (defined SSE || defined SSE2 || SSE3)
+  signal(SIGILL,&catch_ill_inst);
+#endif
+
+  strcpy(gauge_filename,"conf.save");
+  strcpy(nstore_filename,"nstore_counter");
+  strcpy(tmp_filename, ".conf.tmp");
+
+  verbose = 1;
+  g_use_clover_flag = 0;
+
+  process_args(argc,argv,&input_filename,&filename);
+  set_default_filenames(&input_filename,&filename);
+
+  init_parallel_and_read_input(argc, argv, input_filename);
+
+  DUM_DERI = 4;
+  DUM_MATRIX = DUM_DERI+7;
+  if(g_running_phmc) {
+    NO_OF_SPINORFIELDS = DUM_MATRIX+8;
+  }
+  else {
+    NO_OF_SPINORFIELDS = DUM_MATRIX+6;
+  }
+  DUM_BI_DERI = 6;
+  DUM_BI_SOLVER = DUM_BI_DERI+7;
+
+  DUM_BI_MATRIX = DUM_BI_SOLVER+6;
+  NO_OF_BISPINORFIELDS = DUM_BI_MATRIX+6;
+  
+  //4 extra fields (corresponding to DUM_MATRIX+0..5) for deg. and ND matrix mult.
+  NO_OF_SPINORFIELDS_32 = 6;
+  
+  tmlqcd_mpi_init(argc, argv);
+  tm_stopwatch_push(&g_timers, "HMC", "");
+
+  if(nstore == -1) {
+    countfile = fopen(nstore_filename, "r");
+    if(countfile != NULL) {
+      j = fscanf(countfile, "%d %d %s\n", &nstore, &trajectory_counter, gauge_input_filename);
+      if(j < 1) nstore = 0;
+      if(j < 2) trajectory_counter = 0;
+      fclose(countfile);
+    }
+    else {
+      nstore = 0;
+      trajectory_counter = 0;
+    }
+  }
+  
+#ifndef TM_USE_MPI
+  g_dbw2rand = 0;
+#endif
+  
+  
+  g_mu = g_mu1;
+  
+#ifdef _GAUGE_COPY
+  status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
+  status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
+#else
+  status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
+  status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 0);   
+#endif
+  /* need temporary gauge field for gauge reread checks and in update_tm */
+  status += init_gauge_tmp(VOLUME);
+
+  status += init_gauge_fg(VOLUME);
+
+  if (status != 0) {
+    fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
+    exit(0);
+  }
+  j = init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for geometry_indices! Aborting...\n");
+    exit(0);
+  }
+  if(even_odd_flag) {
+    j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS);
+    j += init_spinor_field_32(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS_32);      
+  }
+  else {
+    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+    j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32);    
+  }
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
+    exit(0);
+  }
+  if(even_odd_flag) {
+    j = init_csg_field(VOLUMEPLUSRAND/2);
+  }
+  else {
+    j = init_csg_field(VOLUMEPLUSRAND);
+  }
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for csg fields! Aborting...\n");
+    exit(0);
+  }
+  j = init_moment_field(VOLUME, VOLUMEPLUSRAND + g_dbw2rand);
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for moment fields! Aborting...\n");
+    exit(0);
+  }
+
+  if(g_running_phmc) {
+    j = init_bispinor_field(VOLUME/2, NO_OF_BISPINORFIELDS);
+    if (j!= 0) {
+      fprintf(stderr, "Not enough memory for bi-spinor fields! Aborting...\n");
+      exit(0);
+    }
+  }
+
+  /* list and initialize measurements*/
+  if(g_proc_id == 0) {
+    printf("\n");
+    for(j = 0; j < no_measurements; j++) {
+      printf("# measurement id %d, type = %d: Frequency %d\n", j, measurement_list[j].type, measurement_list[j].freq);
+    }
+  }
+  init_measurements();
+
+  /*construct the filenames for the observables and the parameters*/
+  strncpy(datafilename,filename,200);  
+  strcat(datafilename,".data");
+  strncpy(parameterfilename,filename,200);  
+  strcat(parameterfilename,".para");
+
+  if(g_proc_id == 0){
+    parameterfile = fopen(parameterfilename, "a");
+    write_first_messages(parameterfile, "hmc", git_hash);
+  }
+
+  /* define the geometry */
+  geometry();
+
+  /* define the boundary conditions for the fermion fields */
+  boundary(g_kappa);
+
+  status = check_geometry();
+
+  if (status != 0) {
+    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
+    exit(1);
+  }
+
+
+#ifdef _USE_HALFSPINOR
+  j = init_dirac_halfspinor();
+  if (j!= 0) {
+    fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
+    exit(-1);
+  }
+
+  j = init_dirac_halfspinor32();
+  if (j != 0)
+  {
+    fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
+    exit(-1);
+  } 
+  
+#  if (defined _PERSISTENT)
+  init_xchange_halffield();
+#  endif
+#endif
+
+  /* Initialise random number generator */
+  start_ranlux(rlxd_level, random_seed^trajectory_counter);
+
+  /* Set up the gauge field */
+  /* continue and restart */
+  if(startoption==3 || startoption == 2) {
+    if(g_proc_id == 0) {
+      printf("# Trying to read gauge field from file %s in %s precision.\n",
+            gauge_input_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
+      fflush(stdout);
+    }
+    if( (status = read_gauge_field(gauge_input_filename,g_gauge_field)) != 0) {
+      fprintf(stderr, "Error %d while reading gauge field from %s\nAborting...\n", status, gauge_input_filename);
+      exit(-2);
+    }
+
+    if (g_proc_id == 0){
+      printf("# Finished reading gauge field.\n");
+      fflush(stdout);
+    }
+  }
+  else if (startoption == 1) {
+    /* hot */
+    random_gauge_field(reproduce_randomnumber_flag, g_gauge_field);
+  }
+  else if(startoption == 0) {
+    /* cold */
+    unit_g_gauge_field();
+  }
+
+  /*For parallelization: exchange the gaugefield */
+#ifdef TM_USE_MPI
+  xchange_gauge(g_gauge_field);
+  update_tm_gauge_exchange(&g_gauge_state);
+#endif
+    
+  /*Convert to a 32 bit gauge field, after xchange*/
+  convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND + g_dbw2rand);
+#ifdef TM_USE_MPI
+  update_tm_gauge_exchange(&g_gauge_state_32);
+#endif
+  
+    
+  if(even_odd_flag) {
+    j = init_monomials(VOLUMEPLUSRAND/2, even_odd_flag);
+  }
+  else {
+    j = init_monomials(VOLUMEPLUSRAND, even_odd_flag);
+  }
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for monomial pseudo fermion fields! Aborting...\n");
+    exit(0);
+  }
+
+  init_integrator();
+
+  if(g_proc_id == 0) {
+    for(j = 0; j < no_monomials; j++) {
+      printf("# monomial id %d type = %d timescale %d\n", j, monomial_list[j].type, monomial_list[j].timescale);
+    }
+  }
+
+  plaquette_energy = measure_plaquette( (const su3**) g_gauge_field);
+  if(g_rgi_C1 > 0. || g_rgi_C1 < 0.) {
+    rectangle_energy = measure_rectangles( (const su3**) g_gauge_field);
+    if(g_proc_id == 0){
+      fprintf(parameterfile,"# Computed rectangle value: %14.12f.\n",rectangle_energy/(12.*VOLUME*g_nproc));
+    }
+  }
+  //eneg = g_rgi_C0 * plaquette_energy + g_rgi_C1 * rectangle_energy;
+
+  if(g_proc_id == 0) {
+    fprintf(parameterfile,"# Computed plaquette value: %14.12f.\n", plaquette_energy/(6.*VOLUME*g_nproc));
+    printf("# Computed plaquette value: %14.12f.\n", plaquette_energy/(6.*VOLUME*g_nproc));
+    fclose(parameterfile);
+  }
+
+  /* set ddummy to zero */
+  for(ix = 0; ix < VOLUMEPLUSRAND; ix++){
+    for(mu=0; mu<4; mu++){
+      ddummy[ix][mu].d1=0.;
+      ddummy[ix][mu].d2=0.;
+      ddummy[ix][mu].d3=0.;
+      ddummy[ix][mu].d4=0.;
+      ddummy[ix][mu].d5=0.;
+      ddummy[ix][mu].d6=0.;
+      ddummy[ix][mu].d7=0.;
+      ddummy[ix][mu].d8=0.;
+    }
+  }
+
+
+  for(j = 0; j < no_monomials; j++) {
+    if( (monomial_list[j].type == NDPOLY) || (monomial_list[j].type == NDDETRATIO)
+    || (monomial_list[j].type == NDCLOVER) || (monomial_list[j].type == NDRAT)
+    || (monomial_list[j].type == NDCLOVERRAT) || (monomial_list[j].type == NDRATCOR)
+    || (monomial_list[j].type == NDCLOVERRATCOR) || (monomial_list[j].type == NDCLOVERDETRATIO) ) {
+      if( (monomial_list[j].rec_ev != 0) ) {
+        monomial * mnl = &monomial_list[j];
+#ifdef TM_USE_QUDA
+        eigsolveQuda(&eval_max, 1, eigenvalue_precision, 1, 0, 1000, 1,
+                     mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
+                     mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
+                     1, // we only support even-odd here
+                     mnl->solver_params.refinement_precision,
+                     mnl->solver_params.sloppy_precision,
+                     mnl->solver_params.compression_type, 0);
+        if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
+          eval_max /= mnl->StildeMax;
+        }
+        if(g_proc_id == 0){
+          printf("monomial name: %s , id: %d, maximal eigenvalue = %e\n",mnl->name,j,creal(eval_max));
+        }
+        eigsolveQuda(&eval_min, 1, eigenvalue_precision, 1, 0, 1000, 0,
+                     mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
+                     mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
+                     1, // we only support even-odd here
+                     mnl->solver_params.refinement_precision,
+                     mnl->solver_params.sloppy_precision,
+                     mnl->solver_params.compression_type, 0);
+        if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
+          eval_min /= mnl->StildeMax;
+        }
+        if(g_proc_id == 0){
+          printf("monomial name: %s , id: %d, lowest eigenvalue = %e\n",mnl->name,j,creal(eval_min));
+        }
+#else
+        if(g_proc_id == 0) {
+          fprintf(stderr, "Error: Attempted to use QUDA eigensolver but this build was not configured for QUDA usage.\n");
+        #ifdef TM_USE_MPI
+          MPI_Finalize();
+        #endif
+          exit(-2);
+        }
+#endif
+      }
+    }else if( (monomial_list[j].type == CLOVERTRLOG) || (monomial_list[j].type == CLOVERDET)
+    || (monomial_list[j].type == CLOVERDETRATIO) || (monomial_list[j].type == CLOVERNDTRLOG)
+    || (monomial_list[j].type == CLOVERRAT) || (monomial_list[j].type == CLOVERRATCOR)
+    || (monomial_list[j].type == CLOVERDETRATIORW) || (monomial_list[j].type == POLY)
+    || (monomial_list[j].type == POLYDETRATIO) || (monomial_list[j].type == RAT)
+    || (monomial_list[j].type == RATCOR) ) {
+      if( (monomial_list[j].rec_ev != 0) ) {
+        monomial * mnl = &monomial_list[j];
+#ifdef TM_USE_QUDA
+        eigsolveQuda(&eval_max, 1, eigenvalue_precision, 1, 0, 1000, 1,
+                     mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
+                     mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
+                     1, // we only support even-odd here
+                     mnl->solver_params.refinement_precision,
+                     mnl->solver_params.sloppy_precision,
+                     mnl->solver_params.compression_type, 1);
+        if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
+          eval_max /= mnl->StildeMax;
+        }
+        if(g_proc_id == 0){
+          printf("monomial name: %s , id: %d, maximal eigenvalue = %e\n",mnl->name,j,creal(eval_max));
+        }
+        eigsolveQuda(&eval_min, 1, eigenvalue_precision, 1, 0, 1000, 0,
+                     mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
+                     mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
+                     1, // we only support even-odd here
+                     mnl->solver_params.refinement_precision,
+                     mnl->solver_params.sloppy_precision,
+                     mnl->solver_params.compression_type, 1);
+        if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
+          eval_min /= mnl->StildeMax;
+        }
+        if(g_proc_id == 0){
+          printf("monomial name: %s , id: %d, lowest eigenvalue = %e\n",mnl->name,j,creal(eval_min));
+        }
+#else
+        if(g_proc_id == 0) {
+          fprintf(stderr, "Error: Attempted to use QUDA eigensolver but this build was not configured for QUDA usage.\n");
+        #ifdef TM_USE_MPI
+          MPI_Finalize();
+        #endif
+          exit(-2);
+        }
+#endif
+      }
+    }
+  }
+
+  #ifdef TM_USE_OMP
+  free_omp_accumulators();
+#endif
+  free_gauge_tmp();
+  free_gauge_field();
+  free_gauge_field_32();  
+  free_geometry_indices();
+  free_spinor_field();
+  free_spinor_field_32();  
+  free_moment_field();
+  free_monomials();
+  if(g_running_phmc) {
+    free_bispinor_field();
+    free_chi_spinor_field();
+  }
+  free(input_filename);
+  free(filename);
+  free(SourceInfo.basename);
+  free(PropInfo.basename);
+
+  tm_stopwatch_pop(&g_timers, 0, 1, "");
+
+#ifdef TM_USE_QUDA
+  _endQuda();
+#endif
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Finalize();
+#endif
+
+  return(0);
+#ifdef _KOJAK_INST
+#pragma pomp inst end(main)
+#endif
+}
+
+static void usage(const tm_ExitCode_t exit_code){
+  if(g_proc_id == 0){
+    fprintf(stdout, "QUDA eigensolver for finding largest and lowest eigenvalues\n");
+    fprintf(stdout, "Use exactly same input as hmc_tm\n");
+    fprintf(stdout, "Set `ComputeEVFreq` to non-zero for the operators for which eigenvalues need to calculated\n");
+    fprintf(stdout, "Version %s \n\n", TMLQCD_PACKAGE_VERSION);
+    fprintf(stdout, "Please send bug reports to %s\n", TMLQCD_PACKAGE_BUGREPORT);
+    fprintf(stdout, "Usage:   hmc_tm [options]\n");
+    fprintf(stdout, "Options: [-f input-filename]  default: hmc.input\n");
+    fprintf(stdout, "         [-o output-filename] default: output\n");
+    fprintf(stdout, "         [-v] more verbosity\n");
+    fprintf(stdout, "         [-V] print version information and exit\n");
+    fprintf(stdout, "         [-m level] request MPI thread level 'single' or 'multiple' (default: 'single')\n");
+    fprintf(stdout, "         [-h|-? this help]\n");
+  }
+  exit(exit_code);
+}
+
+static void process_args(int argc, char *argv[], char ** input_filename, char ** filename) {
+  int c;
+  while ((c = getopt(argc, argv, "h?vVf:o:m:")) != -1) {
+    switch (c) {
+      case 'f':
+        *input_filename = calloc(200, sizeof(char));
+        strncpy(*input_filename, optarg, 200);
+        break;
+      case 'o':
+        *filename = calloc(200, sizeof(char));
+        strncpy(*filename, optarg, 200);
+        break;
+      case 'v':
+        verbose = 1;
+        break;
+      case 'V':
+        if(g_proc_id == 0) {
+          fprintf(stdout,"%s %s\n",TMLQCD_PACKAGE_STRING,git_hash);
+        }
+        exit(TM_EXIT_SUCCESS);
+        break;
+      case 'm':
+        if( !strcmp(optarg, "single") ){
+          g_mpi_thread_level = TM_MPI_THREAD_SINGLE;
+        } else if ( !strcmp(optarg, "multiple") ) {
+          g_mpi_thread_level = TM_MPI_THREAD_MULTIPLE;
+        } else {
+          tm_debug_printf(0, 0, "[hmc_tm process_args]: invalid input for -m command line argument\n");
+          usage(TM_EXIT_INVALID_CMDLINE_ARG);
+        }
+        break;
+      case 'h':
+      case '?':
+      default:
+        usage(TM_EXIT_SUCCESS);
+        break;
+    }
+  }
+}
+
+static void set_default_filenames(char ** input_filename, char ** filename) {
+  if( *input_filename == NULL ) {
+    *input_filename = calloc(13, sizeof(char));
+    strcpy(*input_filename,"hmc.input");
+  }
+  
+  if( *filename == NULL ) {
+    *filename = calloc(7, sizeof(char));
+    strcpy(*filename,"output");
+  } 
+}
+

From 3bb66397238102f43b4cd58eba895eee91c6bd40 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Wed, 4 Oct 2023 13:02:32 +0200
Subject: [PATCH 43/49] Revert "script for testing eigsolveQuda"

This reverts commit a46c4c654b3275ecf06f73448b2b0612d3375c9d.
---
 test_eigsolveQuda.c | 558 --------------------------------------------
 1 file changed, 558 deletions(-)
 delete mode 100644 test_eigsolveQuda.c

diff --git a/test_eigsolveQuda.c b/test_eigsolveQuda.c
deleted file mode 100644
index e7baf3090..000000000
--- a/test_eigsolveQuda.c
+++ /dev/null
@@ -1,558 +0,0 @@
-#include "lime.h"
-#if HAVE_CONFIG_H
-#include<tmlqcd_config.h>
-#endif
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <time.h>
-#include <sys/time.h>
-#include <string.h>
-#include <signal.h>
-#include <unistd.h>
-#include <float.h>
-#ifdef TM_USE_MPI
-#include <mpi.h>
-#endif
-#ifdef TM_USE_OMP
-#include <omp.h>
-#endif
-#include "global.h"
-#include "git_hash.h"
-#include "io/params.h"
-#include "io/gauge.h"
-#include "getopt.h"
-#include "ranlxd.h"
-#include "geometry_eo.h"
-#include "start.h"
-#include "measure_gauge_action.h"
-#include "measure_rectangles.h"
-#ifdef TM_USE_MPI
-#include "xchange/xchange.h"
-#endif
-#include "read_input.h"
-#include "mpi_init.h"
-#include "sighandler.h"
-#include "update_tm.h"
-#include "init/init.h"
-#include "test/check_geometry.h"
-#include "boundary.h"
-#include "phmc.h"
-#include "solver/solver.h"
-#include "monomial/monomial.h"
-#include "integrator.h"
-#include "sighandler.h"
-#include "meas/measurements.h"
-#include "operator/tm_operators_nd.h"
-#ifdef DDalphaAMG
-#include "DDalphaAMG_interface.h"
-#endif
-#ifdef TM_USE_QUDA
-#  include "quda_interface.h"
-#endif
-
-extern int nstore;
-
-int const rlxdsize = 105;
-
-static void usage(const tm_ExitCode_t exit_code);
-static void process_args(int argc, char *argv[], char ** input_filename, char ** filename);
-static void set_default_filenames(char ** input_filename, char ** filename);
-
-int main(int argc,char *argv[]) {
-
-  FILE *parameterfile=NULL, *countfile=NULL;
-  char *filename = NULL;
-  char datafilename[206];
-  char parameterfilename[206];
-  char gauge_filename[50];
-  char nstore_filename[50];
-  char tmp_filename[50];
-  char *input_filename = NULL;
-  int status = 0, accept = 0;
-  int j,ix,mu, trajectory_counter=0;
-  unsigned int const io_max_attempts = 5; /* Make this configurable? */
-  unsigned int const io_timeout = 5; /* Make this configurable? */
-  struct timeval t1;
-
-  _Complex double eval_min = 0.0;
-  _Complex double eval_max = 0.0;
-
-  /* Energy corresponding to the Gauge part */
-  double plaquette_energy = 0., rectangle_energy = 0.;
-    /* Acceptance rate */
-  int Rate=0;
-  /* Do we want to perform reversibility checks */
-  /* See also return_check_flag in read_input.h */
-  int return_check = 0;
-
-  paramsXlfInfo *xlfInfo;
-
-/* For online measurements */
-  measurement * meas;
-  int imeas;
-
-  init_critical_globals(TM_PROGRAM_HMC_TM);
-
-#ifdef _KOJAK_INST
-#pragma pomp inst init
-#pragma pomp inst begin(main)
-#endif
-
-  #if (defined SSE || defined SSE2 || SSE3)
-  signal(SIGILL,&catch_ill_inst);
-#endif
-
-  strcpy(gauge_filename,"conf.save");
-  strcpy(nstore_filename,"nstore_counter");
-  strcpy(tmp_filename, ".conf.tmp");
-
-  verbose = 1;
-  g_use_clover_flag = 0;
-
-  process_args(argc,argv,&input_filename,&filename);
-  set_default_filenames(&input_filename,&filename);
-
-  init_parallel_and_read_input(argc, argv, input_filename);
-
-  DUM_DERI = 4;
-  DUM_MATRIX = DUM_DERI+7;
-  if(g_running_phmc) {
-    NO_OF_SPINORFIELDS = DUM_MATRIX+8;
-  }
-  else {
-    NO_OF_SPINORFIELDS = DUM_MATRIX+6;
-  }
-  DUM_BI_DERI = 6;
-  DUM_BI_SOLVER = DUM_BI_DERI+7;
-
-  DUM_BI_MATRIX = DUM_BI_SOLVER+6;
-  NO_OF_BISPINORFIELDS = DUM_BI_MATRIX+6;
-  
-  //4 extra fields (corresponding to DUM_MATRIX+0..5) for deg. and ND matrix mult.
-  NO_OF_SPINORFIELDS_32 = 6;
-  
-  tmlqcd_mpi_init(argc, argv);
-  tm_stopwatch_push(&g_timers, "HMC", "");
-
-  if(nstore == -1) {
-    countfile = fopen(nstore_filename, "r");
-    if(countfile != NULL) {
-      j = fscanf(countfile, "%d %d %s\n", &nstore, &trajectory_counter, gauge_input_filename);
-      if(j < 1) nstore = 0;
-      if(j < 2) trajectory_counter = 0;
-      fclose(countfile);
-    }
-    else {
-      nstore = 0;
-      trajectory_counter = 0;
-    }
-  }
-  
-#ifndef TM_USE_MPI
-  g_dbw2rand = 0;
-#endif
-  
-  
-  g_mu = g_mu1;
-  
-#ifdef _GAUGE_COPY
-  status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1);
-  status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1);
-#else
-  status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0);
-  status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 0);   
-#endif
-  /* need temporary gauge field for gauge reread checks and in update_tm */
-  status += init_gauge_tmp(VOLUME);
-
-  status += init_gauge_fg(VOLUME);
-
-  if (status != 0) {
-    fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
-    exit(0);
-  }
-  j = init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand);
-  if (j != 0) {
-    fprintf(stderr, "Not enough memory for geometry_indices! Aborting...\n");
-    exit(0);
-  }
-  if(even_odd_flag) {
-    j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS);
-    j += init_spinor_field_32(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS_32);      
-  }
-  else {
-    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
-    j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32);    
-  }
-  if (j != 0) {
-    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
-    exit(0);
-  }
-  if(even_odd_flag) {
-    j = init_csg_field(VOLUMEPLUSRAND/2);
-  }
-  else {
-    j = init_csg_field(VOLUMEPLUSRAND);
-  }
-  if (j != 0) {
-    fprintf(stderr, "Not enough memory for csg fields! Aborting...\n");
-    exit(0);
-  }
-  j = init_moment_field(VOLUME, VOLUMEPLUSRAND + g_dbw2rand);
-  if (j != 0) {
-    fprintf(stderr, "Not enough memory for moment fields! Aborting...\n");
-    exit(0);
-  }
-
-  if(g_running_phmc) {
-    j = init_bispinor_field(VOLUME/2, NO_OF_BISPINORFIELDS);
-    if (j!= 0) {
-      fprintf(stderr, "Not enough memory for bi-spinor fields! Aborting...\n");
-      exit(0);
-    }
-  }
-
-  /* list and initialize measurements*/
-  if(g_proc_id == 0) {
-    printf("\n");
-    for(j = 0; j < no_measurements; j++) {
-      printf("# measurement id %d, type = %d: Frequency %d\n", j, measurement_list[j].type, measurement_list[j].freq);
-    }
-  }
-  init_measurements();
-
-  /*construct the filenames for the observables and the parameters*/
-  strncpy(datafilename,filename,200);  
-  strcat(datafilename,".data");
-  strncpy(parameterfilename,filename,200);  
-  strcat(parameterfilename,".para");
-
-  if(g_proc_id == 0){
-    parameterfile = fopen(parameterfilename, "a");
-    write_first_messages(parameterfile, "hmc", git_hash);
-  }
-
-  /* define the geometry */
-  geometry();
-
-  /* define the boundary conditions for the fermion fields */
-  boundary(g_kappa);
-
-  status = check_geometry();
-
-  if (status != 0) {
-    fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n");
-    exit(1);
-  }
-
-
-#ifdef _USE_HALFSPINOR
-  j = init_dirac_halfspinor();
-  if (j!= 0) {
-    fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
-    exit(-1);
-  }
-
-  j = init_dirac_halfspinor32();
-  if (j != 0)
-  {
-    fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n");
-    exit(-1);
-  } 
-  
-#  if (defined _PERSISTENT)
-  init_xchange_halffield();
-#  endif
-#endif
-
-  /* Initialise random number generator */
-  start_ranlux(rlxd_level, random_seed^trajectory_counter);
-
-  /* Set up the gauge field */
-  /* continue and restart */
-  if(startoption==3 || startoption == 2) {
-    if(g_proc_id == 0) {
-      printf("# Trying to read gauge field from file %s in %s precision.\n",
-            gauge_input_filename, (gauge_precision_read_flag == 32 ? "single" : "double"));
-      fflush(stdout);
-    }
-    if( (status = read_gauge_field(gauge_input_filename,g_gauge_field)) != 0) {
-      fprintf(stderr, "Error %d while reading gauge field from %s\nAborting...\n", status, gauge_input_filename);
-      exit(-2);
-    }
-
-    if (g_proc_id == 0){
-      printf("# Finished reading gauge field.\n");
-      fflush(stdout);
-    }
-  }
-  else if (startoption == 1) {
-    /* hot */
-    random_gauge_field(reproduce_randomnumber_flag, g_gauge_field);
-  }
-  else if(startoption == 0) {
-    /* cold */
-    unit_g_gauge_field();
-  }
-
-  /*For parallelization: exchange the gaugefield */
-#ifdef TM_USE_MPI
-  xchange_gauge(g_gauge_field);
-  update_tm_gauge_exchange(&g_gauge_state);
-#endif
-    
-  /*Convert to a 32 bit gauge field, after xchange*/
-  convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND + g_dbw2rand);
-#ifdef TM_USE_MPI
-  update_tm_gauge_exchange(&g_gauge_state_32);
-#endif
-  
-    
-  if(even_odd_flag) {
-    j = init_monomials(VOLUMEPLUSRAND/2, even_odd_flag);
-  }
-  else {
-    j = init_monomials(VOLUMEPLUSRAND, even_odd_flag);
-  }
-  if (j != 0) {
-    fprintf(stderr, "Not enough memory for monomial pseudo fermion fields! Aborting...\n");
-    exit(0);
-  }
-
-  init_integrator();
-
-  if(g_proc_id == 0) {
-    for(j = 0; j < no_monomials; j++) {
-      printf("# monomial id %d type = %d timescale %d\n", j, monomial_list[j].type, monomial_list[j].timescale);
-    }
-  }
-
-  plaquette_energy = measure_plaquette( (const su3**) g_gauge_field);
-  if(g_rgi_C1 > 0. || g_rgi_C1 < 0.) {
-    rectangle_energy = measure_rectangles( (const su3**) g_gauge_field);
-    if(g_proc_id == 0){
-      fprintf(parameterfile,"# Computed rectangle value: %14.12f.\n",rectangle_energy/(12.*VOLUME*g_nproc));
-    }
-  }
-  //eneg = g_rgi_C0 * plaquette_energy + g_rgi_C1 * rectangle_energy;
-
-  if(g_proc_id == 0) {
-    fprintf(parameterfile,"# Computed plaquette value: %14.12f.\n", plaquette_energy/(6.*VOLUME*g_nproc));
-    printf("# Computed plaquette value: %14.12f.\n", plaquette_energy/(6.*VOLUME*g_nproc));
-    fclose(parameterfile);
-  }
-
-  /* set ddummy to zero */
-  for(ix = 0; ix < VOLUMEPLUSRAND; ix++){
-    for(mu=0; mu<4; mu++){
-      ddummy[ix][mu].d1=0.;
-      ddummy[ix][mu].d2=0.;
-      ddummy[ix][mu].d3=0.;
-      ddummy[ix][mu].d4=0.;
-      ddummy[ix][mu].d5=0.;
-      ddummy[ix][mu].d6=0.;
-      ddummy[ix][mu].d7=0.;
-      ddummy[ix][mu].d8=0.;
-    }
-  }
-
-
-  for(j = 0; j < no_monomials; j++) {
-    if( (monomial_list[j].type == NDPOLY) || (monomial_list[j].type == NDDETRATIO)
-    || (monomial_list[j].type == NDCLOVER) || (monomial_list[j].type == NDRAT)
-    || (monomial_list[j].type == NDCLOVERRAT) || (monomial_list[j].type == NDRATCOR)
-    || (monomial_list[j].type == NDCLOVERRATCOR) || (monomial_list[j].type == NDCLOVERDETRATIO) ) {
-      if( (monomial_list[j].rec_ev != 0) ) {
-        monomial * mnl = &monomial_list[j];
-#ifdef TM_USE_QUDA
-        eigsolveQuda(&eval_max, 1, eigenvalue_precision, 1, 0, 1000, 1,
-                     mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
-                     mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
-                     1, // we only support even-odd here
-                     mnl->solver_params.refinement_precision,
-                     mnl->solver_params.sloppy_precision,
-                     mnl->solver_params.compression_type, 0);
-        if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
-          eval_max /= mnl->StildeMax;
-        }
-        if(g_proc_id == 0){
-          printf("monomial name: %s , id: %d, maximal eigenvalue = %e\n",mnl->name,j,creal(eval_max));
-        }
-        eigsolveQuda(&eval_min, 1, eigenvalue_precision, 1, 0, 1000, 0,
-                     mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
-                     mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
-                     1, // we only support even-odd here
-                     mnl->solver_params.refinement_precision,
-                     mnl->solver_params.sloppy_precision,
-                     mnl->solver_params.compression_type, 0);
-        if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
-          eval_min /= mnl->StildeMax;
-        }
-        if(g_proc_id == 0){
-          printf("monomial name: %s , id: %d, lowest eigenvalue = %e\n",mnl->name,j,creal(eval_min));
-        }
-#else
-        if(g_proc_id == 0) {
-          fprintf(stderr, "Error: Attempted to use QUDA eigensolver but this build was not configured for QUDA usage.\n");
-        #ifdef TM_USE_MPI
-          MPI_Finalize();
-        #endif
-          exit(-2);
-        }
-#endif
-      }
-    }else if( (monomial_list[j].type == CLOVERTRLOG) || (monomial_list[j].type == CLOVERDET)
-    || (monomial_list[j].type == CLOVERDETRATIO) || (monomial_list[j].type == CLOVERNDTRLOG)
-    || (monomial_list[j].type == CLOVERRAT) || (monomial_list[j].type == CLOVERRATCOR)
-    || (monomial_list[j].type == CLOVERDETRATIORW) || (monomial_list[j].type == POLY)
-    || (monomial_list[j].type == POLYDETRATIO) || (monomial_list[j].type == RAT)
-    || (monomial_list[j].type == RATCOR) ) {
-      if( (monomial_list[j].rec_ev != 0) ) {
-        monomial * mnl = &monomial_list[j];
-#ifdef TM_USE_QUDA
-        eigsolveQuda(&eval_max, 1, eigenvalue_precision, 1, 0, 1000, 1,
-                     mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
-                     mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
-                     1, // we only support even-odd here
-                     mnl->solver_params.refinement_precision,
-                     mnl->solver_params.sloppy_precision,
-                     mnl->solver_params.compression_type, 1);
-        if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
-          eval_max /= mnl->StildeMax;
-        }
-        if(g_proc_id == 0){
-          printf("monomial name: %s , id: %d, maximal eigenvalue = %e\n",mnl->name,j,creal(eval_max));
-        }
-        eigsolveQuda(&eval_min, 1, eigenvalue_precision, 1, 0, 1000, 0,
-                     mnl->accprec, mnl->maxiter, mnl->eig_polydeg, mnl->eig_amin, 
-                     mnl->eig_amax, mnl->eig_n_kr, mnl->solver, g_relative_precision_flag,
-                     1, // we only support even-odd here
-                     mnl->solver_params.refinement_precision,
-                     mnl->solver_params.sloppy_precision,
-                     mnl->solver_params.compression_type, 1);
-        if( fabs(mnl->EVMax - 1.) < 2*DBL_EPSILON ) {
-          eval_min /= mnl->StildeMax;
-        }
-        if(g_proc_id == 0){
-          printf("monomial name: %s , id: %d, lowest eigenvalue = %e\n",mnl->name,j,creal(eval_min));
-        }
-#else
-        if(g_proc_id == 0) {
-          fprintf(stderr, "Error: Attempted to use QUDA eigensolver but this build was not configured for QUDA usage.\n");
-        #ifdef TM_USE_MPI
-          MPI_Finalize();
-        #endif
-          exit(-2);
-        }
-#endif
-      }
-    }
-  }
-
-  #ifdef TM_USE_OMP
-  free_omp_accumulators();
-#endif
-  free_gauge_tmp();
-  free_gauge_field();
-  free_gauge_field_32();  
-  free_geometry_indices();
-  free_spinor_field();
-  free_spinor_field_32();  
-  free_moment_field();
-  free_monomials();
-  if(g_running_phmc) {
-    free_bispinor_field();
-    free_chi_spinor_field();
-  }
-  free(input_filename);
-  free(filename);
-  free(SourceInfo.basename);
-  free(PropInfo.basename);
-
-  tm_stopwatch_pop(&g_timers, 0, 1, "");
-
-#ifdef TM_USE_QUDA
-  _endQuda();
-#endif
-#ifdef TM_USE_MPI
-  MPI_Barrier(MPI_COMM_WORLD);
-  MPI_Finalize();
-#endif
-
-  return(0);
-#ifdef _KOJAK_INST
-#pragma pomp inst end(main)
-#endif
-}
-
-static void usage(const tm_ExitCode_t exit_code){
-  if(g_proc_id == 0){
-    fprintf(stdout, "QUDA eigensolver for finding largest and lowest eigenvalues\n");
-    fprintf(stdout, "Use exactly same input as hmc_tm\n");
-    fprintf(stdout, "Set `ComputeEVFreq` to non-zero for the operators for which eigenvalues need to calculated\n");
-    fprintf(stdout, "Version %s \n\n", TMLQCD_PACKAGE_VERSION);
-    fprintf(stdout, "Please send bug reports to %s\n", TMLQCD_PACKAGE_BUGREPORT);
-    fprintf(stdout, "Usage:   hmc_tm [options]\n");
-    fprintf(stdout, "Options: [-f input-filename]  default: hmc.input\n");
-    fprintf(stdout, "         [-o output-filename] default: output\n");
-    fprintf(stdout, "         [-v] more verbosity\n");
-    fprintf(stdout, "         [-V] print version information and exit\n");
-    fprintf(stdout, "         [-m level] request MPI thread level 'single' or 'multiple' (default: 'single')\n");
-    fprintf(stdout, "         [-h|-? this help]\n");
-  }
-  exit(exit_code);
-}
-
-static void process_args(int argc, char *argv[], char ** input_filename, char ** filename) {
-  int c;
-  while ((c = getopt(argc, argv, "h?vVf:o:m:")) != -1) {
-    switch (c) {
-      case 'f':
-        *input_filename = calloc(200, sizeof(char));
-        strncpy(*input_filename, optarg, 200);
-        break;
-      case 'o':
-        *filename = calloc(200, sizeof(char));
-        strncpy(*filename, optarg, 200);
-        break;
-      case 'v':
-        verbose = 1;
-        break;
-      case 'V':
-        if(g_proc_id == 0) {
-          fprintf(stdout,"%s %s\n",TMLQCD_PACKAGE_STRING,git_hash);
-        }
-        exit(TM_EXIT_SUCCESS);
-        break;
-      case 'm':
-        if( !strcmp(optarg, "single") ){
-          g_mpi_thread_level = TM_MPI_THREAD_SINGLE;
-        } else if ( !strcmp(optarg, "multiple") ) {
-          g_mpi_thread_level = TM_MPI_THREAD_MULTIPLE;
-        } else {
-          tm_debug_printf(0, 0, "[hmc_tm process_args]: invalid input for -m command line argument\n");
-          usage(TM_EXIT_INVALID_CMDLINE_ARG);
-        }
-        break;
-      case 'h':
-      case '?':
-      default:
-        usage(TM_EXIT_SUCCESS);
-        break;
-    }
-  }
-}
-
-static void set_default_filenames(char ** input_filename, char ** filename) {
-  if( *input_filename == NULL ) {
-    *input_filename = calloc(13, sizeof(char));
-    strcpy(*input_filename,"hmc.input");
-  }
-  
-  if( *filename == NULL ) {
-    *filename = calloc(7, sizeof(char));
-    strcpy(*filename,"output");
-  } 
-}
-

From def70a32e4e97f67975191632f00d9f02f106ce6 Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 25 Oct 2023 12:03:18 +0200
Subject: [PATCH 44/49] default quda_input.mg_mu_factor is set to
 _default_quda_mg_mu_factor on all levels

---
 read_input.l | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/read_input.l b/read_input.l
index da6143e9a..4bebf2790 100644
--- a/read_input.l
+++ b/read_input.l
@@ -3886,7 +3886,7 @@ int read_input(char * conf_file){
     quda_input.mg_smoother_tol[level] = _default_quda_mg_smoother_tol;
 
     quda_input.mg_n_vec[level] = _default_quda_mg_n_vec;
-    quda_input.mg_mu_factor[level] = 1.0;
+    quda_input.mg_mu_factor[level] = _default_quda_mg_mu_factor;
     quda_input.mg_coarse_solver_type[level] = QUDA_GCR_INVERTER;
     quda_input.mg_smoother_type[level] = QUDA_CA_GCR_INVERTER;
 

From 5a2eba14951e8f1edfbfd3a81e3a6a917c3be64b Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 25 Oct 2023 12:06:33 +0200
Subject: [PATCH 45/49] addition and correction to quda documentation

---
 doc/quda.tex | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/quda.tex b/doc/quda.tex
index b034b1a16..1eab42b36 100644
--- a/doc/quda.tex
+++ b/doc/quda.tex
@@ -121,8 +121,8 @@ \subsubsection{General settings}
 \begin{itemize}
   \item \texttt{FermionBC} Forces twisted ({\ttfamily theta}), periodic ({\ttfamily pbc}) or antiperiodic ({\ttfamily apbc}) temporal quark field boundary conditions irrespective of what has been set for \texttt{ThetaT}. This setting exists because at the time of writing (2017.12.28), there seems to be a bug or incompatibility in QUDA which causes (anti-)periodic boundary conditions with gauge compression to produce incorrect propagators. Use with care as the residual check using tmLQCD operators will suggest a non-converged residual.
   \item \texttt{Pipeline} The pipeline length for fused operations in some solvers (for the tmLQCD QUDA interface, at the time of writing in May 2023, this is just GCR). (positive integer, default: $0$)
-  \item \texttt{gcrNkrylov} 
-  \item \texttt{ReliableDelta}
+  \item \texttt{gcrNkrylov} Maximum size of Krylov space used by the solver. (positive integer, default: 10)
+  \item \texttt{ReliableDelta} Reliable update tolerance. (positive float, default: 0.001)
 \end{itemize}
 
 \subsubsection{More advanced settings}
@@ -251,7 +251,7 @@ \subsubsection{QUDA-MG interface}
   \item{ \texttt{MGSmootherPreIterations}: number of smoothing steps before coarse grid correction on a per-level basis. (comma-separated list of zero or positive integers, default: $0$ on all levels)}
   \item{ \texttt{MGSmootherPostIterations}: number of smoothing steps after prolongation on a per-level basis. (comma-separated list of zero or positive integers, default: $4$ on all levels)}
   \item{ \texttt{MGOverUnderRelaxationFactor}: Over- or under-relaxation factor on a per-level basis. (comma-separated list of positive floats, default: $0.85$ on all levels)}
-  \item{ \texttt{MGCoarseMuFactor}: Scaling factor for twisted mass on a per-level basis, accelerates convergence and reduces condition numer of coarse grid solves. From experience it seems that it's reasonable to set this $>1.0$ only on the coarsest level, but sometimes it might also help on intermediate levels. If running with twisted mass, this should always be set and tuned for maximum efficiency. When using coarse-grid deflation (see \texttt{MGEigSolverRequireConvergence}), this should usually be set to $1.0$ on all levels. (comma-separated list of positive floats, usually $ > 1.0$, default $8.0$ from the second level upwards).}
+  \item{ \texttt{MGCoarseMuFactor}: Scaling factor for twisted mass on a per-level basis, accelerates convergence and reduces condition number of coarse grid solves. From experience it seems that it's reasonable to set this $>1.0$ only on the coarsest level, but sometimes it might also help on intermediate levels. If running with twisted mass, this should always be set and tuned for maximum efficiency. When using coarse-grid deflation (see \texttt{MGEigSolverRequireConvergence}), this should usually be set to $1.0$ on all levels. (comma-separated list of positive floats, usually $ > 1.0$, default: $1.0$ on all levels).}
   \item{ \texttt{MGSetup2KappaMu}: The value of $2\kappa\mu$ which should be used during the MG setup process. This is important in the HMC for standard twisted mass fermions, for example, because the setup should always be performed with the smallest quark mass to be employed in a simulation and it might be that a monomial with a heavier twisted quark mass is the first to call to MG and to thus trigger the setup. Generally this is set to the target light twisted quark mass. Setting this to $0.0$ implies that it is ignored. (float, default: $0.0$) }
   \item{ \texttt{MGReuseSetupMuThreshold}: When the twisted quark mass is changed between solves using the MG solver, the MG setup is usually \emph{updated} for this new $\mu$ value. One can attempt to reuse the MG setup for solves with different $\mu$ values up to this threshold, i.e., when the condition $x < 2\kappa\cdot|\mu_\mathrm{old} - \mu_\mathrm{new}|$ holds. (positive float, default: \texttt{2*DBL\_EPSILON})}
   \item{ \texttt{MGRefreshSetupMDUThreshold}: When the MG is used in the HMC, the MG setup must be regularly refreshed by running a few iterations of the setup solver on the current set of approximate null vectors in order to evolve these with the changing gauge field. A good rule of thumb is to perform this setup refresh about twice per coarsest time step. In other words: for a trajectory length $\tau$ and $N$ integration steps on the coarsest time scale, the refresh should be performed at intervals of $(\tau/(2N)-\epsilon)$ MDUs, where $\epsilon$ is a small number to make sure that the threshold is hit at every half-step of the integrator. (positive float, default: \texttt{2*DBL\_EPSILON})}

From 8a20bd7d5065aa43c4eaef79b57e8a6c11d12bcc Mon Sep 17 00:00:00 2001
From: Aniket Sen <aniket.sen55@gmail.com>
Date: Wed, 29 Nov 2023 13:36:54 +0100
Subject: [PATCH 46/49] wrapper for doublet inverter

---
 include/tmLQCD.h      |  3 +++
 wrapper/lib_wrapper.c | 49 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/include/tmLQCD.h b/include/tmLQCD.h
index d304b6eca..7623d3c75 100644
--- a/include/tmLQCD.h
+++ b/include/tmLQCD.h
@@ -83,6 +83,9 @@ int tmLQCD_read_gauge(const int nconfig);
 int tmLQCD_invert(double *const propagator, double *const source, const int op_id,
                   const int write_prop);
 
+int tmLQCD_invert_doublet(double* const propagator0, double* const propagator1, double* const source0, 
+                  double* const source1, const int op_id, const int write_prop);
+
 // invert on odd part of lattice with prepared source
 int tmLQCD_invert_eo(double *const Odd_out, double *const Odd_in, const int op_id);
 
diff --git a/wrapper/lib_wrapper.c b/wrapper/lib_wrapper.c
index 3be5e5c39..5cee0b2b9 100644
--- a/wrapper/lib_wrapper.c
+++ b/wrapper/lib_wrapper.c
@@ -278,6 +278,55 @@ int tmLQCD_invert(double* const propagator, double* const source, const int op_i
   return (0);
 }
 
+int tmLQCD_invert_doublet(double* const propagator0, double* const propagator1, double* const source0, 
+                  double* const source1, const int op_id, const int write_prop) {
+  unsigned int index_start = 0;
+  g_mu = 0.;
+
+  if (lowmem_flag && g_proc_id == 0) {
+    printf(
+        "!!! WARNING: you are calling tmLQCD_invert_doublet in \'lowmem\' mode.\n Did you make sure that "
+        "all required fields are allocated and initialised??\n");
+  }
+
+  if (!tmLQCD_invert_initialised) {
+    fprintf(stderr, "tmLQCD_invert: tmLQCD_inver_init must be called first. Aborting...\n");
+    return (-1);
+  }
+
+  if (op_id < 0 || op_id >= no_operators) {
+    fprintf(stderr, "tmLQCD_invert: op_id=%d not in valid range. Aborting...\n", op_id);
+    return (-1);
+  }
+
+  operator_list[op_id].sr0 = g_spinor_field[0];
+  operator_list[op_id].sr1 = g_spinor_field[1];
+  operator_list[op_id].sr2 = g_spinor_field[2];
+  operator_list[op_id].sr3 = g_spinor_field[3];
+  operator_list[op_id].prop0 = g_spinor_field[4];
+  operator_list[op_id].prop1 = g_spinor_field[5];
+  operator_list[op_id].prop2 = g_spinor_field[6];
+  operator_list[op_id].prop3 = g_spinor_field[7];
+
+  zero_spinor_field(operator_list[op_id].prop0, VOLUME / 2);
+  zero_spinor_field(operator_list[op_id].prop1, VOLUME / 2);
+  zero_spinor_field(operator_list[op_id].prop2, VOLUME / 2);
+  zero_spinor_field(operator_list[op_id].prop3, VOLUME / 2);
+
+  // convert to even/odd order
+  convert_lexic_to_eo(operator_list[op_id].sr0, operator_list[op_id].sr1, (spinor*)source0);
+  convert_lexic_to_eo(operator_list[op_id].sr2, operator_list[op_id].sr3, (spinor*)source1);
+
+  // invert
+  operator_list[op_id].inverter(op_id, index_start, write_prop);
+
+  // convert back to lexicographic order
+  convert_eo_to_lexic((spinor*)propagator0, operator_list[op_id].prop0, operator_list[op_id].prop1);
+  convert_eo_to_lexic((spinor*)propagator1, operator_list[op_id].prop2, operator_list[op_id].prop3);
+
+  return (0);
+}
+
 int tmLQCD_invert_eo(double* const Odd_out, double* const Odd_in, const int op_id){
   unsigned int index_start = 0;
   if (!tmLQCD_invert_initialised) {

From a7c02c7ceb7e32e5830d4caeb9bb2d67af5e11ae Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Thu, 7 Dec 2023 18:45:12 +0100
Subject: [PATCH 47/49] fix omp in reorder_mom_fromQuda

---
 quda_interface.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 2438ffd38..716649bbe 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -677,11 +677,6 @@ void reorder_mom_fromQuda() {
   // mom_quda -> mom_quda_reordered
   tm_stopwatch_push(&g_timers, __func__, "");
 
-#ifdef TM_USE_OMP
-#pragma omp parallel
-  {
-#endif
-
 #ifdef TM_USE_OMP
   #pragma omp parallel for collapse(4)
 #endif
@@ -713,9 +708,6 @@ void reorder_mom_fromQuda() {
 #endif
         }
   
-#ifdef TM_USE_OMP
-  }
-#endif
   tm_stopwatch_pop(&g_timers, 0, 0, "TM_QUDA");
 }
 

From eb0c6e114083c31051d92083166612511ab271a8 Mon Sep 17 00:00:00 2001
From: Bartosz Kostrzewa <bartosz_kostrzewa@fastmail.com>
Date: Tue, 12 Dec 2023 19:34:54 +0100
Subject: [PATCH 48/49] minor change to deal with SourceInfo.no_flavours,
 adjust some comments and error messages

---
 include/tmLQCD.h      | 3 +++
 operator.c            | 6 ++++++
 wrapper/lib_wrapper.c | 4 ++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/tmLQCD.h b/include/tmLQCD.h
index 7623d3c75..c629cdb56 100644
--- a/include/tmLQCD.h
+++ b/include/tmLQCD.h
@@ -83,6 +83,9 @@ int tmLQCD_read_gauge(const int nconfig);
 int tmLQCD_invert(double *const propagator, double *const source, const int op_id,
                   const int write_prop);
 
+// invert with source and propagator provided in TXYZ spin colour complex lexicographic order
+// propagator has kappa normalisation
+// the two propagators and sources correspond to two flavours
 int tmLQCD_invert_doublet(double* const propagator0, double* const propagator1, double* const source0, 
                   double* const source1, const int op_id, const int write_prop);
 
diff --git a/operator.c b/operator.c
index 6df4cc7cd..8ceb8d4dd 100644
--- a/operator.c
+++ b/operator.c
@@ -410,6 +410,12 @@ void op_invert(const int op_id, const int index_start, const int write_prop) {
         break;
     }
   } else if(optr->type == DBTMWILSON || optr->type == DBCLOVER) {
+    // there is no default for this set anywhere other than prepare_source.c
+    // such that calling this branch from an external program via tmLQCD_invert_doublet
+    // would result in undefined behaviour
+    // we thus set a default here unless it's explicitly set to 2
+    if( SourceInfo.no_flavours != 2 ) SourceInfo.no_flavours = 1;
+
     if(optr->type == DBCLOVER) {
       if (g_cart_id == 0 && g_debug_level > 1) {
         printf("#\n# csw = %e, computing clover leafs\n", g_c_sw);
diff --git a/wrapper/lib_wrapper.c b/wrapper/lib_wrapper.c
index 5cee0b2b9..9af22eae2 100644
--- a/wrapper/lib_wrapper.c
+++ b/wrapper/lib_wrapper.c
@@ -290,12 +290,12 @@ int tmLQCD_invert_doublet(double* const propagator0, double* const propagator1,
   }
 
   if (!tmLQCD_invert_initialised) {
-    fprintf(stderr, "tmLQCD_invert: tmLQCD_inver_init must be called first. Aborting...\n");
+    fprintf(stderr, "tmLQCD_invert_doublet: tmLQCD_inver_init must be called first. Aborting...\n");
     return (-1);
   }
 
   if (op_id < 0 || op_id >= no_operators) {
-    fprintf(stderr, "tmLQCD_invert: op_id=%d not in valid range. Aborting...\n", op_id);
+    fprintf(stderr, "tmLQCD_invert_doublet: op_id=%d not in valid range. Aborting...\n", op_id);
     return (-1);
   }
 

From 863ed0fceece7dfde72eb6b55d553a20ea6bfc2e Mon Sep 17 00:00:00 2001
From: Marcogarofalo <marcogarofalo31@gmail.com>
Date: Thu, 14 Dec 2023 21:10:38 +0100
Subject: [PATCH 49/49] remove unnecessary reset of mom_quda also in the gauge
 derivative

---
 quda_interface.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/quda_interface.c b/quda_interface.c
index 716649bbe..bdf143886 100644
--- a/quda_interface.c
+++ b/quda_interface.c
@@ -2511,11 +2511,6 @@ void compute_gauge_derivative_quda(monomial * const mnl, hamiltonian_field_t * c
     else
       loop_coeff[i] = -0.66666666666 * g_beta * mnl->c1;
   }
-  
-  #pragma omp parallel for
-  for(int i = 0; i < 4; i++){
-    memset(mom_quda[i], 0, VOLUME*10*sizeof(double));
-  }
 
   reorder_gauge_toQuda(hf->gaugefield, NO_COMPRESSION);
   // the reordering above overwrites gauge_quda