Skip to content

Commit

Permalink
Merge branch 'develop' into feature/sycl
Browse files: browse the repository at this point in the history
  • Loading branch information
jcosborn committed Nov 23, 2024
2 parents 3c3d80a + 1efcbeb commit 7c24446
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 46 deletions.
5 changes: 2 additions & 3 deletions tests/contract_ft_test.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -181,9 +181,8 @@ inline int launch_contract_test(const QudaContractType cType, const std::array<i
contractFTQuda(spinorX.data(), spinorY.data(), &d_result_, cType, (void *)(&cs_param), src_colors, X.data(),
source_position.data(), n_mom, mom.data(), fft_type.data());
// Check results:
int faults
= contractionFT_reference<Float>(spinorX.data(), spinorY.data(), d_result.data(), cType,
src_colors, X.data(), source_position.data(), n_mom, mom.data(), fft_type.data());
int faults = contractionFT_reference<Float>(spinorX.data(), spinorY.data(), d_result.data(), cType, src_colors,
X.data(), source_position.data(), n_mom, mom.data(), fft_type.data());

return faults;
}
Expand Down
4 changes: 2 additions & 2 deletions tests/host_reference/contract_ft_reference.h
Original file line number | Diff line number | Diff line change
Expand Up @@ -126,8 +126,8 @@ void contractFTHost(void **h_prop_array_flavor_1, void **h_prop_array_flavor_2,
for (int c1 = 0; c1 < src_colors; c1++) {
// color contraction
size_t off = nSpin * 3 * 2 * (Vh * parity + cb_idx);
contractColors<Float>(static_cast<Float*>(h_prop_array_flavor_1[s1 * src_colors + c1]) + off,
static_cast<Float*>(h_prop_array_flavor_2[s2 * src_colors + c1]) + off, nSpin, M.data());
contractColors<Float>(static_cast<Float *>(h_prop_array_flavor_1[s1 * src_colors + c1]) + off,
static_cast<Float *>(h_prop_array_flavor_2[s2 * src_colors + c1]) + off, nSpin, M.data());

// apply gamma matrices here

Expand Down
31 changes: 19 additions & 12 deletions tests/host_reference/gauge_force_reference.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -437,26 +437,30 @@ void gauge_force_reference_dir(void *refMom, int dir, double eb3, quda::GaugeFie
for (int i = 0; i < num_paths; i++) {
if (prec == QUDA_DOUBLE_PRECISION) {
double *my_loop_coeff = (double *)loop_coeff;
compute_path_product((dsu3_matrix *)staple, u_ex.data_array<dsu3_matrix*>().data, path_dir[i], length[i], my_loop_coeff[i],
dir, lat);
compute_path_product((dsu3_matrix *)staple, u_ex.data_array<dsu3_matrix *>().data, path_dir[i], length[i],
my_loop_coeff[i], dir, lat);
} else {
float *my_loop_coeff = (float *)loop_coeff;
compute_path_product((fsu3_matrix *)staple, u_ex.data_array<fsu3_matrix*>().data, path_dir[i], length[i], my_loop_coeff[i],
dir, lat);
compute_path_product((fsu3_matrix *)staple, u_ex.data_array<fsu3_matrix *>().data, path_dir[i], length[i],
my_loop_coeff[i], dir, lat);
}
}

if (compute_force) {
if (prec == QUDA_DOUBLE_PRECISION) {
update_mom((danti_hermitmat *)refMom, dir, u.data_array<dsu3_matrix*>().data, (dsu3_matrix *)staple, (double)eb3, lat);
update_mom((danti_hermitmat *)refMom, dir, u.data_array<dsu3_matrix *>().data, (dsu3_matrix *)staple, (double)eb3,
lat);
} else {
update_mom((fanti_hermitmat *)refMom, dir, u.data_array<fsu3_matrix*>().data, (fsu3_matrix *)staple, (float)eb3, lat);
update_mom((fanti_hermitmat *)refMom, dir, u.data_array<fsu3_matrix *>().data, (fsu3_matrix *)staple, (float)eb3,
lat);
}
} else {
if (prec == QUDA_DOUBLE_PRECISION) {
update_gauge((dsu3_matrix *)refMom, dir, u.data_array<dsu3_matrix*>().data, (dsu3_matrix *)staple, (double)eb3, lat);
update_gauge((dsu3_matrix *)refMom, dir, u.data_array<dsu3_matrix *>().data, (dsu3_matrix *)staple, (double)eb3,
lat);
} else {
update_gauge((fsu3_matrix *)refMom, dir, u.data_array<fsu3_matrix*>().data, (fsu3_matrix *)staple, (float)eb3, lat);
update_gauge((fsu3_matrix *)refMom, dir, u.data_array<fsu3_matrix *>().data, (fsu3_matrix *)staple, (float)eb3,
lat);
}
}
host_free(staple);
Expand All @@ -472,12 +476,13 @@ void gauge_force_reference(void *refMom, double eb3, quda::GaugeField &u, int **
setGaugeParam(param);
param.gauge_order = QUDA_QDP_GAUGE_ORDER;
param.t_boundary = QUDA_PERIODIC_T;

auto qdp_ex = quda::createExtendedGauge(u.data_array().data, param, R);
lattice_t lat(*qdp_ex);

for (int dir = 0; dir < 4; dir++) {
gauge_force_reference_dir(refMom, dir, eb3, u, *qdp_ex, u.Precision(), path_dir[dir], length, loop_coeff,
num_paths, lat, compute_force);
gauge_force_reference_dir(refMom, dir, eb3, u, *qdp_ex, u.Precision(), path_dir[dir], length, loop_coeff, num_paths,
lat, compute_force);
}

delete qdp_ex;
Expand All @@ -500,11 +505,13 @@ void gauge_loop_trace_reference(quda::GaugeField &u, std::vector<quda::Complex>

for (int i = 0; i < num_paths; i++) {
if (u.Precision() == QUDA_DOUBLE_PRECISION) {
dcomplex tr = compute_loop_trace(qdp_ex->data_array<dsu3_matrix *>().data, input_path[i], length[i], path_coeff[i], lat);
dcomplex tr
= compute_loop_trace(qdp_ex->data_array<dsu3_matrix *>().data, input_path[i], length[i], path_coeff[i], lat);
loop_tr_dbl[2 * i] = factor * tr.real;
loop_tr_dbl[2 * i + 1] = factor * tr.imag;
} else {
dcomplex tr = compute_loop_trace(qdp_ex->data_array<fsu3_matrix *>().data, input_path[i], length[i], path_coeff[i], lat);
dcomplex tr
= compute_loop_trace(qdp_ex->data_array<fsu3_matrix *>().data, input_path[i], length[i], path_coeff[i], lat);
loop_tr_dbl[2 * i] = factor * tr.real;
loop_tr_dbl[2 * i + 1] = factor * tr.imag;
}
Expand Down
60 changes: 35 additions & 25 deletions tests/host_reference/hisq_force_reference.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -756,7 +756,8 @@ void computeOneLinkSite(
#else
const int[],
#endif
int half_lattice_index, const Real *const *const oprod, int sig, Real coeff, const LoadStore<Real> &ls, Real *const *const output)
int half_lattice_index, const Real *const *const oprod, int sig, Real coeff, const LoadStore<Real> &ls,
Real *const *const output)
{
if (GOES_FORWARDS(sig)) {
typename ColorMatrix<Real>::Type colorMatW;
Expand Down Expand Up @@ -836,13 +837,13 @@ void computeMiddleLinkSite(int half_lattice_index, // half_lattice_index to bett

if (Qprev == NULL) {
if (sig_positive) {
ls.loadMatrixFromField(static_cast<const Real*const*const>(oprod), 1 - oddBit, sig, point_d, &colorMatY);
ls.loadMatrixFromField(static_cast<const Real *const *const>(oprod), 1 - oddBit, sig, point_d, &colorMatY);
} else {
ls.loadMatrixFromField(static_cast<const Real*const*const>(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY);
ls.loadMatrixFromField(static_cast<const Real *const *const>(oprod), oddBit, OPP_DIR(sig), point_c, &colorMatY);
colorMatY = conj(colorMatY);
}
} else { // Qprev != NULL
ls.loadMatrixFromField(static_cast<const Real*const>(oprod), oddBit, point_c, &colorMatY);
ls.loadMatrixFromField(static_cast<const Real *const>(oprod), oddBit, point_c, &colorMatY);
}

colorMatW = (!mu_positive) ? bc_link * colorMatY : conj(bc_link) * colorMatY;
Expand Down Expand Up @@ -973,8 +974,8 @@ void computeSideLinkSite(int half_lattice_index, // half_lattice_index to better
template <class Real>
void computeSideLinkField(const int dim[4], const Real *const P3,
const Real *const Qprod, // why?
const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP,
Real *const *const newOprod)
const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff,
Real *const shortP, Real *const *const newOprod)
{
// Need some way of setting half_volume
int volume = 1;
Expand All @@ -999,8 +1000,8 @@ void computeSideLinkField(const int dim[4], const Real *const P3,

template <class Real, int oddBit>
void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better match the GPU code.
const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const *const link,
int sig, int mu, Real coeff, Real accumu_coeff,
const int dim[4], const Real *const oprod, const Real *const Qprev,
const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff,
const LoadStore<Real> &ls, // pass a function object to read from and write to matrix fields
Real *const shortP, Real *const *const newOprod)
{
Expand Down Expand Up @@ -1086,8 +1087,9 @@ void computeAllLinkSite(int half_lattice_index, // half_lattice_index to better
} // allLinkKernel

template <class Real>
void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev, const Real *const *const link,
int sig, int mu, Real coeff, Real accumu_coeff, Real *const shortP, Real *const *const newOprod)
void computeAllLinkField(const int dim[4], const Real *const oprod, const Real *const Qprev,
const Real *const *const link, int sig, int mu, Real coeff, Real accumu_coeff,
Real *const shortP, Real *const *const newOprod)
{
int volume = 1;
for (int dir = 0; dir < 4; ++dir) volume *= dim[dir];
Expand Down Expand Up @@ -1223,23 +1225,26 @@ void hisqStaplesForceCPU(const double *path_coeff, quda::GaugeField &oprod, quda
if (precision == QUDA_SINGLE_PRECISION) {
// allocate memory for temporary fields
float *tempmat[6];
for (int i = 0; i < 6; i++) { tempmat[i] = static_cast<float*>(safe_malloc(len * 18 * precision)); }
doHisqStaplesForceCPU<float>(X_, act_path_coeff, oprod.data_array<float*>().data, link.data_array<float*>().data, tempmat, newOprod->data_array<float*>().data);
for (int i = 0; i < 6; i++) { tempmat[i] = static_cast<float *>(safe_malloc(len * 18 * precision)); }
doHisqStaplesForceCPU<float>(X_, act_path_coeff, oprod.data_array<float *>().data, link.data_array<float *>().data,
tempmat, newOprod->data_array<float *>().data);
for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); }
} else if (precision == QUDA_DOUBLE_PRECISION) {
// allocate memory for temporary fields
double *tempmat[6];
for (int i = 0; i < 6; i++) { tempmat[i] = static_cast<double*>(safe_malloc(len * 18 * precision)); }
doHisqStaplesForceCPU<double>(X_, act_path_coeff, oprod.data_array<double*>().data, link.data_array<double*>().data, tempmat, newOprod->data_array<double*>().data);
for (int i = 0; i < 6; i++) { tempmat[i] = static_cast<double *>(safe_malloc(len * 18 * precision)); }
doHisqStaplesForceCPU<double>(X_, act_path_coeff, oprod.data_array<double *>().data,
link.data_array<double *>().data, tempmat, newOprod->data_array<double *>().data);
for (int i = 0; i < 6; ++i) { host_free(tempmat[i]); }
} else {
errorQuda("Unsupported precision");
}
}

template <class Real, int oddBit>
void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const *const oprod, const Real *const *const link,
int sig, Real coeff, const LoadStore<Real> &ls, Real *const *const output)
void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *const *const oprod,
const Real *const *const link, int sig, Real coeff, const LoadStore<Real> &ls,
Real *const *const output)
{
if (GOES_FORWARDS(sig)) {

Expand Down Expand Up @@ -1286,8 +1291,8 @@ void computeLongLinkSite(int half_lattice_index, const int dim[4], const Real *c
}

template <class Real>
void computeLongLinkField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig, Real coeff,
Real *const *const output)
void computeLongLinkField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig,
Real coeff, Real *const *const output)
{
int volume = 1;
for (int dir = 0; dir < 4; ++dir) volume *= dim[dir];
Expand All @@ -1313,9 +1318,11 @@ void hisqLongLinkForceCPU(double coeff, quda::GaugeField &oprod, quda::GaugeFiel

for (int sig = 0; sig < 4; ++sig) {
if (precision == QUDA_SINGLE_PRECISION) {
computeLongLinkField<float>(X_, oprod.data_array<float*>().data, link.data_array<float*>().data, sig, coeff, newOprod->data_array<float*>().data);
computeLongLinkField<float>(X_, oprod.data_array<float *>().data, link.data_array<float *>().data, sig, coeff,
newOprod->data_array<float *>().data);
} else if (precision == QUDA_DOUBLE_PRECISION) {
computeLongLinkField<double>(X_, oprod.data_array<double*>().data, link.data_array<double*>().data, sig, coeff, newOprod->data_array<double*>().data);
computeLongLinkField<double>(X_, oprod.data_array<double *>().data, link.data_array<double *>().data, sig, coeff,
newOprod->data_array<double *>().data);
} else {
errorQuda("Unrecognised precision");
}
Expand All @@ -1329,8 +1336,8 @@ void completeForceSite(int half_lattice_index,
#else
const int[],
#endif
const Real *const *const oprod, const Real *const *const link, int sig, const LoadStore<Real> &ls,
Real *const mom)
const Real *const *const oprod, const Real *const *const link, int sig,
const LoadStore<Real> &ls, Real *const mom)
{

typename ColorMatrix<Real>::Type colorMatX, colorMatY, linkW;
Expand All @@ -1351,7 +1358,8 @@ void completeForceSite(int half_lattice_index,
}

template <class Real>
void completeForceField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig, Real *const mom)
void completeForceField(const int dim[4], const Real *const *const oprod, const Real *const *const link, int sig,
Real *const mom)
{
int volume = dim[0] * dim[1] * dim[2] * dim[3];
const int half_volume = volume / 2;
Expand All @@ -1371,9 +1379,11 @@ void hisqCompleteForceCPU(quda::GaugeField &oprod, quda::GaugeField &link, quda:

for (int sig = 0; sig < 4; ++sig) {
if (precision == QUDA_SINGLE_PRECISION) {
completeForceField<float>(X_, oprod.data_array<float*>().data, link.data_array<float*>().data, sig, mom->data<float *>());
completeForceField<float>(X_, oprod.data_array<float *>().data, link.data_array<float *>().data, sig,
mom->data<float *>());
} else if (precision == QUDA_DOUBLE_PRECISION) {
completeForceField<double>(X_, oprod.data_array<double*>().data, link.data_array<double*>().data, sig, mom->data<double *>());
completeForceField<double>(X_, oprod.data_array<double *>().data, link.data_array<double *>().data, sig,
mom->data<double *>());
} else {
errorQuda("Unrecognised precision");
}
Expand Down
4 changes: 2 additions & 2 deletions tests/host_reference/staggered_dslash_reference.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -146,14 +146,14 @@ void stag_dslash(ColorSpinorField &out, const GaugeField &fat_link, const GaugeF
long_link.Ghost()[3].data()};

if (in.Precision() == QUDA_DOUBLE_PRECISION) {
staggeredDslashReference(out.data<double*>(), reinterpret_cast<double **>(qdp_fatlink),
staggeredDslashReference(out.data<double *>(), reinterpret_cast<double **>(qdp_fatlink),
reinterpret_cast<double **>(qdp_longlink), reinterpret_cast<double **>(ghost_fatlink),
reinterpret_cast<double **>(ghost_longlink), in.data<double *>(),
reinterpret_cast<double **>(in.fwdGhostFaceBuffer),
reinterpret_cast<double **>(in.backGhostFaceBuffer), oddBit, daggerBit, dslash_type,
laplace3D);
} else if (in.Precision() == QUDA_SINGLE_PRECISION) {
staggeredDslashReference(out.data<float*>(), reinterpret_cast<float **>(qdp_fatlink),
staggeredDslashReference(out.data<float *>(), reinterpret_cast<float **>(qdp_fatlink),
reinterpret_cast<float **>(qdp_longlink), reinterpret_cast<float **>(ghost_fatlink),
reinterpret_cast<float **>(ghost_longlink), in.data<float *>(),
reinterpret_cast<float **>(in.fwdGhostFaceBuffer),
Expand Down
4 changes: 2 additions & 2 deletions tests/laph_test.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -114,8 +114,8 @@ auto laph_test(test_t param)
std::vector<Complex> qudaRes(nSink * nEv * Lt * nSpin, 0.);

int X[4] = {xdim, ydim, zdim, tdim};
laphSinkProject((__complex__ double *)qudaRes.data(), snkPtr.data(), nSink, tileSink,
evPtr.data(), nEv, tileEv, &invParam, X);
laphSinkProject((__complex__ double *)qudaRes.data(), snkPtr.data(), nSink, tileSink, evPtr.data(), nEv, tileEv,
&invParam, X);
printfQuda("laphSinkProject Done: %g secs, %g Gflops\n", invParam.secs, invParam.gflops / invParam.secs);

auto tol = getTolerance(cuda_prec);
Expand Down

0 comments on commit 7c24446

Please sign in to comment.