Merge pull request #549 from etmc/quda_work_clover_force

WIP Quda work clover force
etmc · Jan 27, 2024 · 28295ac · 28295ac
2 parents 863ed0f + 6d2f3fe
commit 28295ac
Show file tree

Hide file tree

Showing 16 changed files with 439 additions and 127 deletions.
diff --git a/Makefile.in b/Makefile.in
@@ -58,7 +58,7 @@ MODULES = read_input gamma measure_gauge_action start \
 	little_D block operator \
 	spinor_fft X_psi P_M_eta \
 	jacobi fatal_error invert_clover_eo gettime \
-	tm_debug_printf \
+	tm_debug_printf compare_derivative \
         @SPI_FILES@ @QUDA_INTERFACE@ @DDalphaAMG_INTERFACE@
 
 CXXMODULES = @QPHIX_INTERFACE@

diff --git a/compare_derivative.c b/compare_derivative.c
@@ -0,0 +1,79 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2024 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<tmlqcd_config.h>
+#endif
+#ifdef TM_USE_OMP 
+# include <omp.h>
+#endif
+#include <math.h>
+#include <stdio.h>
+#include "global.h"
+#include "monomial/monomial.h"
+
+/* Compares two derivatives, one calculated by an external library and one by
+ * tmLQCD, component by component and reports every deviation.
+ *
+ *   mnl       monomial whose c0, c1 and beta are printed in the summary message
+ *   ext_lib   derivative computed by the external library, indexed [ix][mu]
+ *   native    derivative computed by tmLQCD, indexed [ix][mu]
+ *   threshold largest absolute difference per component that is still accepted
+ *   name      label used in the printed messages and passed to fatal_error
+ *
+ * For each site ix < VOLUME, direction mu < 4 and component j < 8, a deviation
+ * is counted and printed when the absolute difference exceeds threshold or
+ * when either of the two values is not finite. If deviations were found on
+ * this process and g_strict_residual_check is set, fatal_error is called.
+ * Finally, process 0 prints the maximum deviation count over all processes
+ * (with TM_USE_MPI) or the local count (without). */
+void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native, 
+    const double threshold, const char * name){
+  int n_diff = 0;
+
+  for(int ix = 0; ix < VOLUME; ix++){
+    for(int mu=0; mu<4; mu++){
+      /* each entry is read as 8 consecutive doubles starting at member d1
+       * (assumes su3adj is laid out that way -- TODO confirm against su3adj.h) */
+      const double *ext=&(ext_lib[ix][mu].d1);
+      const double *nat=&(native[ix][mu].d1);
+      for(int j=0; j<8; ++j){
+        const double diff=ext[j]-nat[j];
+        /* A NaN on either side makes diff NaN and every ordered comparison
+         * false, so non-finite inputs are tested explicitly instead of relying
+         * on the threshold comparison alone. */
+        if (!isfinite(ext[j]) || !isfinite(nat[j]) || fabs(diff) > threshold){
+            n_diff++;
+            printf("derivative at (t,x,y,z,mu,j) %d,%d,%d,%d,%d,%d,"
+                   " ext: %-14e, native: %-14e ratio: %-14g diff %-14g  on proc_id %d\n", 
+                  g_coord[ix][0], g_coord[ix][1], g_coord[ix][2], g_coord[ix][3], mu, j,
+                  ext[j], nat[j], ext[j]/nat[j], diff, g_proc_id);
+        }
+      }
+    }
+  }
+  if(n_diff > 0){
+    printf("%s: the deviation between tmLQCD and the external library "
+           "exceeds the threshold %.1e in %d case(s) for parameters: c0=%e c1=%e g_beta=%e on proc_id: %d\n",
+           name,
+           threshold,
+           n_diff,
+           mnl->c0,
+           mnl->c1,
+           mnl->beta,
+           g_proc_id);
+
+    /* NOTE(review): this is reached before the collective calls below and only
+     * on processes that saw deviations -- confirm that fatal_error terminates
+     * all processes, otherwise the others would wait in MPI_Barrier. */
+    if(g_strict_residual_check) fatal_error("Difference between external library and tmLQCD-native function!", 
+                                            name);
+  }
+
+  /* collect the largest per-process deviation count on process 0 */
+  int red_n_diff = 0;
+#ifdef TM_USE_MPI
+  MPI_Barrier(MPI_COMM_WORLD);
+  MPI_Reduce(&n_diff, &red_n_diff, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
+#else
+  red_n_diff = n_diff;
+#endif
+  if(g_proc_id == 0){
+    printf("The maximum number of deviations in %s exceeding the threshold %.1e was %d\n",
+           name, threshold, red_n_diff);
+  }
+}
+
diff --git a/compare_derivative.h b/compare_derivative.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2024 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * 
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef COMPARE_DERIVATIVE_H
+#define COMPARE_DERIVATIVE_H
+
+#include "monomial/monomial.h"
+#include "su3adj.h"
+
+/* Compares the derivative computed by an external library (ext_lib) with the
+ * one computed by tmLQCD (native), both indexed [ix][mu] for ix < VOLUME and
+ * mu < 4, component by component. Components deviating by more than threshold
+ * are printed and counted; name labels the printed messages and the c0, c1 and
+ * beta members of mnl are included in the summary. When deviations are found
+ * and g_strict_residual_check is set, fatal_error is called. */
+void compare_derivative(monomial *mnl, su3adj **ext_lib, su3adj **native, const double threshold, const char * name);
+
+#endif
diff --git a/configure.in b/configure.in
@@ -964,6 +964,16 @@ if test $enable_quda_experimental = yes; then
 else
   AC_MSG_RESULT(no)
 fi
+# Fermionic forces computed by QUDA: enabled unless explicitly switched off
+# with --enable-quda_fermionic_forces=no
+AC_MSG_CHECKING([whether the QUDA force is enabled])
+AC_ARG_ENABLE(quda_fermionic_forces,
+  AS_HELP_STRING([--enable-quda_fermionic_forces], [enable support for fermionic forces using QUDA [default=yes]]),
+  enable_quda_fermionic_forces=$enableval, enable_quda_fermionic_forces=yes)
+# the variable is quoted so that an empty value does not break the test
+if test "x$enable_quda_fermionic_forces" = "xno"; then
+  AC_MSG_RESULT(no)
+else
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(TM_QUDA_FERMIONIC_FORCES, 1, [fermionic forces with QUDA are enabled])
+fi
 
 # QPhiX library for Intel Xeon and Xeon Phis
 AC_MSG_CHECKING(whether we want to use QPhiX)

diff --git a/doc/input.tex b/doc/input.tex
@@ -425,17 +425,10 @@ \subsection{Input parameter for main program}
 Each of them has different options
 :
 \begin{itemize}
-\item {\ttfamily DET, CLOVERDET}:
-  \begin{itemize}
-  \item {\ttfamily 2KappaMu}
-  \end{itemize}
-\item {\ttfamily CLOVERDET}:
-  \begin{itemize}
-  \item {\ttfamily csw}
-  \end{itemize}
 \item {\ttfamily DET, CLOVERDET}:
   \begin{itemize}
   \item {\ttfamily Kappa}
+  \item {\ttfamily 2KappaMu}
   \item {\ttfamily Timescale}: the timescale on which to integrate
     this monomial. Counting starts from zero up to the total number of
     timescales minus 1.
@@ -454,8 +447,17 @@ \subsection{Input parameter for main program}
   \item {\ttfamily HB\_Solver}: the solver to be used in the heatbath step, see section \ref{sec:hb.solver} for details.
   \item {\ttfamily Name}: a name to be assigned to the monomial. The
     default is {\ttfamily DET}
+  \item {\ttfamily UseExternalInverter} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
 %
+\item {\ttfamily CLOVERDET}:
+\begin{itemize}
+	\item {\ttfamily csw}
+	\item {\ttfamily UseExternalLibrary} 
+	Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
+\end{itemize}
+%
 \item {\ttfamily DETRATIO}: the same as for {\ttfamily DET}, but in
   addition:
   \begin{itemize}
@@ -464,10 +466,13 @@ \subsection{Input parameter for main program}
   \item {\ttfamily Name}: 
     a name to be assigned to the monomial. 
     The default is {\ttfamily DETRATIO}
+  \item {\ttfamily UseExternalInverter} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
 %
 \item {\ttfamily CLOVERDETRATIO}: 
   see {\ttfamily CLOVERDET} and {\ttfamily DETRATIO}.
+
 %
 \item {\ttfamily GAUGE}: 
   \begin{itemize}
@@ -490,6 +495,8 @@ \subsection{Input parameter for main program}
   \item {\ttfamily RectangleCoefficient}: the value of the parameter
     $c_1$. The coefficient $c_0$ is computed from $c_0 = 1-8c_1$. Is
     effective only for {\ttfamily type = user}.
+   \item {\ttfamily UseExternalLibrary} 
+  Equal to either {\ttfamily no} (default value) or {\ttfamily quda}.
   \end{itemize}
   There is maximally one instance allowed of this type.
 

diff --git a/doc/quda.tex b/doc/quda.tex
@@ -13,7 +13,9 @@ \subsubsection{Design goals of the interface}
 \begin{enumerate}
 	\item \emph{Safety.} Naturally, highest priority is given to the correctness of the output of the interface. 
 	This is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines.
-\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER, DBCLOVER} are supported. In the HMC, the same flag can be used to offload solves for the \texttt{DET, DETRATIO, CLOVERDET, CLOVERDETRATIO, RAT, RATCOR, NDRAT, NDRATCOR, NDCLOVERRAT} and \texttt{NDCLOVERRATCOR} monomials.
+\item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseExternalInverter} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER, DBCLOVER} are supported. 
+    Within the monomial declarations of the input file (between {\ttfamily BeginMonomial} and {\ttfamily EndMonomial}) the same flag can be used to offload solves for the \texttt{DET, DETRATIO, CLOVERDET, CLOVERDETRATIO, RAT, RATCOR, NDRAT, NDRATCOR, NDCLOVERRAT} and \texttt{NDCLOVERRATCOR} monomials in the HMC.
+    Further, the flag {\ttfamily UseExternalLibrary} is introduced which, when set to {\ttfamily quda}, will let QUDA perform the force calculation for the given monomial with support currently limited to {\ttfamily GAUGE, CLOVERDET, CLOVERDETRATIO}.
 	\item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). The QUDA interface is entered .
 	\item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular:
 	\begin{itemize}
@@ -68,6 +70,22 @@ \subsubsection{Installation}
 \end{verbatim}
 Note that a {\ttfamily C++} compiler is required for linking against the QUDA library, therefore set {\ttfamily CXX} appropriately. {\ttfamily \${QUDADIR}} is where you installed QUDA in the previous step and {\ttfamily \${CUDADIR}} is required again for linking.
 
+\subsubsection{QUDA versions}
+
+If you need a version of QUDA after \url{https://github.com/lattice/quda/commit/50864ffde1bd8f46fd4a2a2b2e6d44a5a588e2c2} you need to configure with 
+\begin{verbatim}
+  --enable-quda_experimental=yes
+\end{verbatim}
+
+If you need a version of QUDA before \url{https://github.com/lattice/quda/commit/fd50676db06fc36efb3a791a3059c57cca70bb55} you need to add in the configuration script the option
+\begin{verbatim}
+  --enable-quda_fermionic_forces=no
+\end{verbatim}
+so that the wrapper to the QUDA fermionic forces is not compiled.
+Consequently, when tmLQCD is configured with \texttt{--enable-quda\_fermionic\_forces=no}, setting {\ttfamily UseExternalLibrary = quda} in the input file for the {\ttfamily CLOVERDET, CLOVERDETRATIO} monomials
+is not supported and tmLQCD will stop with an error.
+
+
 \subsubsection{Usage}
 Any main program that reads and handles the operator declaration from an input file can easily be set up to use the QUDA inverter by setting the {\ttfamily UseExternalInverter} flag to {\ttfamily quda}. For example, in the input file for the {\ttfamily invert} executable, add the flag to the operator declaration as
 \begin{verbatim}

diff --git a/global.h b/global.h
@@ -194,7 +194,7 @@ EXTERN su3_32 ** g_gauge_field_copy_32;
 
 EXTERN su3adj ** moment;
 EXTERN su3adj ** df0;
-EXTERN su3adj ** ddummy;
+EXTERN su3adj ** ddummy, ** debug_derivative;
 
 EXTERN int count00,count01,count10,count11,count20,count21;
 EXTERN double g_kappa, g_c_sw, g_beta;

diff --git a/include/tmlqcd_config_internal.h.in b/include/tmlqcd_config_internal.h.in
@@ -203,6 +203,9 @@
 /* Using experimental QUDA version */
 #undef TM_QUDA_EXPERIMENTAL
 
+/* Using QUDA fermionic forces */
+#undef TM_QUDA_FERMIONIC_FORCES
+
 /* Using DDalphaAMG */
 #undef DDalphaAMG
 

diff --git a/init/init_moment_field.c b/init/init_moment_field.c
@@ -28,7 +28,7 @@
 #include "su3adj.h"
 #include "sse.h"
 
-su3adj * mo=NULL, *df=NULL, *du=NULL;
+su3adj * mo=NULL, *df=NULL, *du=NULL, *du_internal=NULL;
 
 int init_moment_field(const int V, const int VR) {
   int i = 0;
@@ -94,6 +94,27 @@ int init_moment_field(const int V, const int VR) {
     ddummy[i] = ddummy[i-1]+4;
   }
 
+  if(g_debug_level>3){
+    if((void*)(du_internal = (su3adj*)calloc(4*VR+1, sizeof(su3adj))) == NULL) {
+      printf ("malloc errno : %d\n",errno); 
+      errno = 0;
+      return(5);
+    }
+    if((void*)(debug_derivative = (su3adj**)calloc(VR,sizeof(su3adj*))) == NULL) {
+      printf ("malloc errno : %d\n",errno); 
+      errno = 0;
+      return(6);
+    }
+#if ( defined SSE || defined SSE2 || defined SSE3)
+    debug_derivative[0] = (su3adj*)(((unsigned long int)(du_internal)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+    debug_derivative[0] = du_internal;
+#endif
+
+    for(i = 1; i < VR; i++){
+      debug_derivative[i] = debug_derivative[i-1]+4;
+    }
+  }
   return(0);
 }