diff --git a/roofit/batchcompute/src/Batches.h b/roofit/batchcompute/src/Batches.h index e461cb9e41fa9..5589199ffc677 100644 --- a/roofit/batchcompute/src/Batches.h +++ b/roofit/batchcompute/src/Batches.h @@ -23,65 +23,32 @@ so that they can contain data for every kind of compute function. #ifndef ROOFIT_BATCHCOMPUTE_BATCHES_H #define ROOFIT_BATCHCOMPUTE_BATCHES_H -#include - #include namespace RooBatchCompute { -namespace RF_ARCH { - class Batch { public: const double *__restrict _array = nullptr; bool _isVector = false; - Batch() = default; - inline Batch(InputArr array, bool isVector) : _array{array}, _isVector{isVector} {} - - __roodevice__ constexpr bool isItVector() const { return _isVector; } - inline void set(InputArr array, bool isVector) - { - _array = array; - _isVector = isVector; - } - inline void advance(std::size_t _nEvents) { _array += _isVector * _nEvents; } #ifdef __CUDACC__ - __roodevice__ constexpr double operator[](std::size_t i) const noexcept { return _isVector ? _array[i] : _array[0]; } + __device__ constexpr double operator[](std::size_t i) const noexcept { return _isVector ? 
_array[i] : _array[0]; } #else constexpr double operator[](std::size_t i) const noexcept { return _array[i]; } #endif // #ifdef __CUDACC__ }; -///////////////////////////////////////////////////////////////////////////////////////////////////////// - class Batches { public: - Batch *_arrays = nullptr; - double *_extraArgs = nullptr; - std::size_t _nEvents = 0; - std::size_t _nBatches = 0; - std::size_t _nExtraArgs = 0; - RestrictArr _output = nullptr; - - __roodevice__ std::size_t getNEvents() const { return _nEvents; } - __roodevice__ std::size_t getNExtraArgs() const { return _nExtraArgs; } - __roodevice__ double extraArg(std::size_t i) const { return _extraArgs[i]; } - __roodevice__ void setExtraArg(std::size_t i, double val) { _extraArgs[i] = val; } - __roodevice__ Batch operator[](int batchIdx) const { return _arrays[batchIdx]; } - inline void setNEvents(std::size_t n) { _nEvents = n; } - inline void advance(std::size_t nEvents) - { - for (std::size_t i = 0; i < _nBatches; i++) - _arrays[i].advance(nEvents); - _output += nEvents; - } + Batch *args = nullptr; + double *extra; + std::size_t nEvents = 0; + std::size_t nBatches = 0; + std::size_t nExtra = 0; + RestrictArr output = nullptr; }; -// Defines the actual argument type of the compute function. -using BatchesHandle = Batches &; - -} // End namespace RF_ARCH } // end namespace RooBatchCompute #endif // #ifdef ROOFIT_BATCHCOMPUTE_BATCHES_H diff --git a/roofit/batchcompute/src/ComputeFunctions.cxx b/roofit/batchcompute/src/ComputeFunctions.cxx index acb3509ae08d7..2dc2320196def 100644 --- a/roofit/batchcompute/src/ComputeFunctions.cxx +++ b/roofit/batchcompute/src/ComputeFunctions.cxx @@ -45,64 +45,66 @@ of performance, maximum memory coalescing. 
For more details, see namespace RooBatchCompute { namespace RF_ARCH { -__rooglobal__ void computeAddPdf(BatchesHandle batches) +__rooglobal__ void computeAddPdf(Batches &batches) { - const int nPdfs = batches.getNExtraArgs(); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = batches.extraArg(0) * batches[0][i]; + const int nPdfs = batches.nExtra; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = batches.extra[0] * batches.args[0][i]; + } for (int pdf = 1; pdf < nPdfs; pdf++) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] += batches.extraArg(pdf) * batches[pdf][i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] += batches.extra[pdf] * batches.args[pdf][i]; + } } } -__rooglobal__ void computeArgusBG(BatchesHandle batches) +__rooglobal__ void computeArgusBG(Batches &batches) { - Batch m = batches[0]; - Batch m0 = batches[1]; - Batch c = batches[2]; - Batch p = batches[3]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch m = batches.args[0]; + Batch m0 = batches.args[1]; + Batch c = batches.args[2]; + Batch p = batches.args[3]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double t = m[i] / m0[i]; const double u = 1 - t * t; - batches._output[i] = c[i] * u + p[i] * fast_log(u); + batches.output[i] = c[i] * u + p[i] * fast_log(u); } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (m[i] >= m0[i]) { - batches._output[i] = 0.0; + batches.output[i] = 0.0; } else { - batches._output[i] = m[i] * fast_exp(batches._output[i]); + batches.output[i] = m[i] * fast_exp(batches.output[i]); } } } -__rooglobal__ void computeBMixDecay(BatchesHandle batches) +__rooglobal__ void computeBMixDecay(Batches &batches) { - Batch coef0 = batches[0]; - Batch coef1 = batches[1]; - Batch tagFlav = batches[2]; - Batch delMistag = batches[3]; - 
Batch mixState = batches[4]; - Batch mistag = batches[5]; + Batch coef0 = batches.args[0]; + Batch coef1 = batches.args[1]; + Batch tagFlav = batches.args[2]; + Batch delMistag = batches.args[3]; + Batch mixState = batches.args[4]; + Batch mistag = batches.args[5]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = coef0[i] * (1.0 - tagFlav[i] * delMistag[0]) + coef1[i] * (mixState[i] * (1.0 - 2.0 * mistag[0])); } } -__rooglobal__ void computeBernstein(BatchesHandle batches) +__rooglobal__ void computeBernstein(Batches &batches) { - const int nCoef = batches.getNExtraArgs() - 2; + const int nCoef = batches.nExtra - 2; const int degree = nCoef - 1; - const double xmin = batches.extraArg(nCoef); - const double xmax = batches.extraArg(nCoef + 1); - Batch xData = batches[0]; + const double xmin = batches.extra[nCoef]; + const double xmax = batches.extra[nCoef + 1]; + Batch xData = batches.args[0]; // apply binomial coefficient in-place so we don't have to allocate new memory double binomial = 1.0; for (int k = 0; k < nCoef; k++) { - batches.setExtraArg(k, batches.extraArg(k) * binomial); + batches.extra[k] = batches.extra[k] * binomial; binomial = (binomial * (degree - k)) / (k + 1); } @@ -111,31 +113,31 @@ __rooglobal__ void computeBernstein(BatchesHandle batches) double _1_X[bufferSize]; double powX[bufferSize]; double pow_1_X[bufferSize]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { powX[i] = pow_1_X[i] = 1.0; X[i] = (xData[i] - xmin) / (xmax - xmin); _1_X[i] = 1 - X[i]; - batches._output[i] = 0.0; + batches.output[i] = 0.0; } // raising 1-x to the power of degree for (int k = 2; k <= degree; k += 2) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) pow_1_X[i] *= _1_X[i] * _1_X[i]; } if (degree % 2 == 1) 
{ - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) pow_1_X[i] *= _1_X[i]; } // inverting 1-x ---> 1/(1-x) - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) _1_X[i] = 1 / _1_X[i]; for (int k = 0; k < nCoef; k++) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] += batches.extraArg(k) * powX[i] * pow_1_X[i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] += batches.extra[k] * powX[i] * pow_1_X[i]; // calculating next power for x and 1-x powX[i] *= X[i]; @@ -143,8 +145,8 @@ __rooglobal__ void computeBernstein(BatchesHandle batches) } } } else { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 0.0; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 0.0; const double X = (xData[i] - xmin) / (xmax - xmin); double powX = 1.0; double pow_1_X = 1.0; @@ -152,7 +154,7 @@ __rooglobal__ void computeBernstein(BatchesHandle batches) pow_1_X *= 1 - X; const double _1_X = 1 / (1 - X); for (int k = 0; k < nCoef; k++) { - batches._output[i] += batches.extraArg(k) * powX * pow_1_X; + batches.output[i] += batches.extra[k] * powX * pow_1_X; powX *= X; pow_1_X *= _1_X; } @@ -162,52 +164,52 @@ __rooglobal__ void computeBernstein(BatchesHandle batches) // reset extraArgs values so we don't mutate the Batches object binomial = 1.0; for (int k = 0; k < nCoef; k++) { - batches.setExtraArg(k, batches.extraArg(k) / binomial); + batches.extra[k] = batches.extra[k] / binomial; binomial = (binomial * (degree - k)) / (k + 1); } } -__rooglobal__ void computeBifurGauss(BatchesHandle batches) +__rooglobal__ void computeBifurGauss(Batches &batches) { - Batch X = batches[0]; - Batch M = batches[1]; - Batch SL = batches[2]; - Batch SR = batches[3]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch X = 
batches.args[0]; + Batch M = batches.args[1]; + Batch SL = batches.args[2]; + Batch SR = batches.args[3]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double arg = X[i] - M[i]; if (arg < 0) { arg /= SL[i]; } else { arg /= SR[i]; } - batches._output[i] = fast_exp(-0.5 * arg * arg); + batches.output[i] = fast_exp(-0.5 * arg * arg); } } -__rooglobal__ void computeBreitWigner(BatchesHandle batches) +__rooglobal__ void computeBreitWigner(Batches &batches) { - Batch X = batches[0]; - Batch M = batches[1]; - Batch W = batches[2]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch X = batches.args[0]; + Batch M = batches.args[1]; + Batch W = batches.args[2]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double arg = X[i] - M[i]; - batches._output[i] = 1 / (arg * arg + 0.25 * W[i] * W[i]); + batches.output[i] = 1 / (arg * arg + 0.25 * W[i] * W[i]); } } -__rooglobal__ void computeBukin(BatchesHandle batches) +__rooglobal__ void computeBukin(Batches &batches) { - Batch X = batches[0]; - Batch XP = batches[1]; - Batch SP = batches[2]; - Batch XI = batches[3]; - Batch R1 = batches[4]; - Batch R2 = batches[5]; + Batch X = batches.args[0]; + Batch XP = batches.args[1]; + Batch SP = batches.args[2]; + Batch XI = batches.args[3]; + Batch R1 = batches.args[4]; + Batch R2 = batches.args[5]; const double r3 = log(2.0); const double r6 = exp(-6.0); const double r7 = 2 * sqrt(2 * log(2.0)); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double r1 = XI[i] * fast_isqrt(XI[i] * XI[i] + 1); const double r4 = 1 / fast_isqrt(XI[i] * XI[i] + 1); const double hp = 1 / (SP[i] * r7); @@ -231,61 +233,61 @@ __rooglobal__ void computeBukin(BatchesHandle batches) rho = R2[i]; } - batches._output[i] = rho * y * y / Yp / Yp - r3 + factor * 4 * r3 * y * hp * r5 * r4 / yi / yi; + batches.output[i] = rho * y * y / Yp / Yp - r3 + factor * 4 * r3 * y * hp * r5 * r4 / yi / 
yi; if (X[i] >= x1 && X[i] < x2) { - batches._output[i] = + batches.output[i] = fast_log(1 + 4 * XI[i] * r4 * (X[i] - XP[i]) * hp) / fast_log(1 + 2 * XI[i] * (XI[i] - r4)); - batches._output[i] *= -batches._output[i] * r3; + batches.output[i] *= -batches.output[i] * r3; } if (X[i] >= x1 && X[i] < x2 && XI[i] < r6 && XI[i] > -r6) - batches._output[i] = -4 * r3 * (X[i] - XP[i]) * (X[i] - XP[i]) * hp * hp; + batches.output[i] = -4 * r3 * (X[i] - XP[i]) * (X[i] - XP[i]) * hp * hp; } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = fast_exp(batches._output[i]); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = fast_exp(batches.output[i]); } -__rooglobal__ void computeCBShape(BatchesHandle batches) +__rooglobal__ void computeCBShape(Batches &batches) { - Batch M = batches[0]; - Batch M0 = batches[1]; - Batch S = batches[2]; - Batch A = batches[3]; - Batch N = batches[4]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch M = batches.args[0]; + Batch M0 = batches.args[1]; + Batch S = batches.args[2]; + Batch A = batches.args[3]; + Batch N = batches.args[4]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double t = (M[i] - M0[i]) / S[i]; if ((A[i] > 0 && t >= -A[i]) || (A[i] < 0 && -t >= A[i])) { - batches._output[i] = -0.5 * t * t; + batches.output[i] = -0.5 * t * t; } else { - batches._output[i] = N[i] / (N[i] - A[i] * A[i] - A[i] * t); - batches._output[i] = fast_log(batches._output[i]); - batches._output[i] *= N[i]; - batches._output[i] -= 0.5 * A[i] * A[i]; + batches.output[i] = N[i] / (N[i] - A[i] * A[i] - A[i] * t); + batches.output[i] = fast_log(batches.output[i]); + batches.output[i] *= N[i]; + batches.output[i] -= 0.5 * A[i] * A[i]; } } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = fast_exp(batches._output[i]); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = fast_exp(batches.output[i]); } 
-__rooglobal__ void computeChebychev(BatchesHandle batches) +__rooglobal__ void computeChebychev(Batches &batches) { - Batch xData = batches[0]; - const int nCoef = batches.getNExtraArgs() - 2; - const double xmin = batches.extraArg(nCoef); - const double xmax = batches.extraArg(nCoef + 1); + Batch xData = batches.args[0]; + const int nCoef = batches.nExtra - 2; + const double xmin = batches.extra[nCoef]; + const double xmax = batches.extra[nCoef + 1]; if (STEP == 1) { double prev[bufferSize][2]; double X[bufferSize]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { // set a0-->prev[i][0] and a1-->prev[i][1] // and x tranfsformed to range[-1..1]-->X[i] - prev[i][0] = batches._output[i] = 1.0; + prev[i][0] = batches.output[i] = 1.0; prev[i][1] = X[i] = 2 * (xData[i] - 0.5 * (xmax + xmin)) / (xmax - xmin); } for (int k = 0; k < nCoef; k++) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] += prev[i][1] * batches.extraArg(k); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] += prev[i][1] * batches.extra[k]; // compute next order const double next = 2 * X[i] * prev[i][1] - prev[i][0]; @@ -294,13 +296,13 @@ __rooglobal__ void computeChebychev(BatchesHandle batches) } } } else { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double prev0 = 1.0; double prev1 = 2 * (xData[i] - 0.5 * (xmax + xmin)) / (xmax - xmin); double X = prev1; - batches._output[i] = 1.0; + batches.output[i] = 1.0; for (int k = 0; k < nCoef; k++) { - batches._output[i] += prev1 * batches.extraArg(k); + batches.output[i] += prev1 * batches.extra[k]; // compute next order const double next = 2 * X * prev1 - prev0; @@ -311,127 +313,127 @@ __rooglobal__ void computeChebychev(BatchesHandle batches) } } -__rooglobal__ void computeChiSquare(BatchesHandle batches) +__rooglobal__ void 
computeChiSquare(Batches &batches) { - Batch X = batches[0]; - const double ndof = batches.extraArg(0); + Batch X = batches.args[0]; + const double ndof = batches.extra[0]; const double gamma = 1 / std::tgamma(ndof / 2.0); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = gamma; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = gamma; constexpr double ln2 = 0.693147180559945309417232121458; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double arg = (ndof - 2) * fast_log(X[i]) - X[i] - ndof * ln2; - batches._output[i] *= fast_exp(0.5 * arg); + batches.output[i] *= fast_exp(0.5 * arg); } } -__rooglobal__ void computeDeltaFunction(BatchesHandle batches) +__rooglobal__ void computeDeltaFunction(Batches &batches) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 0.0 + (batches[0][i] == 1.0); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 0.0 + (batches.args[0][i] == 1.0); } } -__rooglobal__ void computeDstD0BG(BatchesHandle batches) +__rooglobal__ void computeDstD0BG(Batches &batches) { - Batch DM = batches[0]; - Batch DM0 = batches[1]; - Batch C = batches[2]; - Batch A = batches[3]; - Batch B = batches[4]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch DM = batches.args[0]; + Batch DM0 = batches.args[1]; + Batch C = batches.args[2]; + Batch A = batches.args[3]; + Batch B = batches.args[4]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double ratio = DM[i] / DM0[i]; const double arg1 = (DM0[i] - DM[i]) / C[i]; const double arg2 = A[i] * fast_log(ratio); - batches._output[i] = (1 - fast_exp(arg1)) * fast_exp(arg2) + B[i] * (ratio - 1); + batches.output[i] = (1 - fast_exp(arg1)) * fast_exp(arg2) + B[i] * (ratio - 1); } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - if (batches._output[i] < 0) - 
batches._output[i] = 0; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + if (batches.output[i] < 0) + batches.output[i] = 0; } } -__rooglobal__ void computeExpPoly(BatchesHandle batches) +__rooglobal__ void computeExpPoly(Batches &batches) { - int lowestOrder = batches.extraArg(0); - int nTerms = batches.extraArg(1); - auto x = batches[0]; + int lowestOrder = batches.extra[0]; + int nTerms = batches.extra[1]; + auto x = batches.args[0]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 0.0; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 0.0; double xTmp = std::pow(x[i], lowestOrder); for (int k = 0; k < nTerms; ++k) { - batches._output[i] += batches[k + 1][i] * xTmp; + batches.output[i] += batches.args[k + 1][i] * xTmp; xTmp *= x[i]; } - batches._output[i] = std::exp(batches._output[i]); + batches.output[i] = std::exp(batches.output[i]); } } -__rooglobal__ void computeExponential(BatchesHandle batches) +__rooglobal__ void computeExponential(Batches &batches) { - Batch x = batches[0]; - Batch c = batches[1]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = fast_exp(x[i] * c[i]); + Batch x = batches.args[0]; + Batch c = batches.args[1]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = fast_exp(x[i] * c[i]); } } -__rooglobal__ void computeExponentialNeg(BatchesHandle batches) +__rooglobal__ void computeExponentialNeg(Batches &batches) { - Batch x = batches[0]; - Batch c = batches[1]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = fast_exp(-x[i] * c[i]); + Batch x = batches.args[0]; + Batch c = batches.args[1]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = fast_exp(-x[i] * c[i]); } } -__rooglobal__ void computeGamma(BatchesHandle batches) +__rooglobal__ void computeGamma(Batches &batches) { - Batch X = batches[0]; - Batch G = batches[1]; - Batch 
B = batches[2]; - Batch M = batches[3]; + Batch X = batches.args[0]; + Batch G = batches.args[1]; + Batch B = batches.args[2]; + Batch M = batches.args[3]; double gamma = -std::lgamma(G[0]); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (X[i] == M[i]) { - batches._output[i] = (G[i] == 1.0) / B[i]; - } else if (G.isItVector()) { - batches._output[i] = -std::lgamma(G[i]); + batches.output[i] = (G[i] == 1.0) / B[i]; + } else if (G._isVector) { + batches.output[i] = -std::lgamma(G[i]); } else { - batches._output[i] = gamma; + batches.output[i] = gamma; } } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (X[i] != M[i]) { const double invBeta = 1 / B[i]; double arg = (X[i] - M[i]) * invBeta; - batches._output[i] -= arg; + batches.output[i] -= arg; arg = fast_log(arg); - batches._output[i] += arg * (G[i] - 1); - batches._output[i] = fast_exp(batches._output[i]); - batches._output[i] *= invBeta; + batches.output[i] += arg * (G[i] - 1); + batches.output[i] = fast_exp(batches.output[i]); + batches.output[i] *= invBeta; } } } -__rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches) +__rooglobal__ void computeGaussModelExpBasis(Batches &batches) { const double root2 = std::sqrt(2.); const double root2pi = std::sqrt(2. 
* std::atan2(0., -1.)); - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { - const double x = batches[0][i]; - const double mean = batches[1][i] * batches[2][i]; - const double sigma = batches[3][i] * batches[4][i]; - const double tau = batches[5][i]; + const double x = batches.args[0][i]; + const double mean = batches.args[1][i] * batches.args[2][i]; + const double sigma = batches.args[3][i] * batches.args[4][i]; + const double tau = batches.args[5][i]; if (tau == 0.0) { // Straight Gaussian, used for unconvoluted PDF or expBasis with 0 lifetime @@ -439,7 +441,7 @@ __rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches) double result = std::exp(-0.5 * xprime * xprime) / (sigma * root2pi); if (!isMinus && !isPlus) result *= 2; - batches._output[i] = result; + batches.output[i] = result; } else { // Convolution with exp(-t/tau) const double xprime = (x - mean) / tau; @@ -451,52 +453,52 @@ __rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches) result += RooHeterogeneousMath::evalCerf(0, -u, c).real(); if (!isPlus) result += RooHeterogeneousMath::evalCerf(0, u, c).real(); - batches._output[i] = result; + batches.output[i] = result; } } } -__rooglobal__ void computeGaussian(BatchesHandle batches) +__rooglobal__ void computeGaussian(Batches &batches) { - auto x = batches[0]; - auto mean = batches[1]; - auto sigma = batches[2]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + auto x = batches.args[0]; + auto mean = batches.args[1]; + auto sigma = batches.args[2]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double arg = x[i] - mean[i]; const double halfBySigmaSq = -0.5 / (sigma[i] * sigma[i]); - batches._output[i] = fast_exp(arg * arg * 
halfBySigmaSq); + batches.output[i] = fast_exp(arg * arg * halfBySigmaSq); } } -__rooglobal__ void computeIdentity(BatchesHandle batches) +__rooglobal__ void computeIdentity(Batches &batches) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = batches[0][i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = batches.args[0][i]; } } -__rooglobal__ void computeNegativeLogarithms(BatchesHandle batches) +__rooglobal__ void computeNegativeLogarithms(Batches &batches) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = -fast_log(batches[0][i]); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = -fast_log(batches.args[0][i]); // Multiply by weights if they exist - if (batches.extraArg(0)) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] *= batches[1][i]; + if (batches.extra[0]) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] *= batches.args[1][i]; } } -__rooglobal__ void computeJohnson(BatchesHandle batches) +__rooglobal__ void computeJohnson(Batches &batches) { - Batch mass = batches[0]; - Batch mu = batches[1]; - Batch lambda = batches[2]; - Batch gamma = batches[3]; - Batch delta = batches[4]; + Batch mass = batches.args[0]; + Batch mu = batches.args[1]; + Batch lambda = batches.args[2]; + Batch gamma = batches.args[3]; + Batch delta = batches.args[4]; const double sqrtTwoPi = std::sqrt(TMath::TwoPi()); - const double massThreshold = batches.extraArg(0); + const double massThreshold = batches.extra[0]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double arg = (mass[i] - mu[i]) / lambda[i]; #ifdef R__HAS_VDT const double asinh_arg = fast_log(arg + 1 / fast_isqrt(arg * arg + 1)); @@ -508,7 +510,7 @@ __rooglobal__ void computeJohnson(BatchesHandle batches) delta[i] * fast_exp(-0.5 * expo * expo) 
* fast_isqrt(1. + arg * arg) / (sqrtTwoPi * lambda[i]); const double passThrough = mass[i] >= massThreshold; - batches._output[i] = result * passThrough; + batches.output[i] = result * passThrough; } } @@ -516,7 +518,7 @@ __rooglobal__ void computeJohnson(BatchesHandle batches) * Code copied from function landau_pdf (math/mathcore/src/PdfFuncMathCore.cxx) * and rewritten to enable vectorization. */ -__rooglobal__ void computeLandau(BatchesHandle batches) +__rooglobal__ void computeLandau(Batches &batches) { auto case0 = [](double x) { const double a1[3] = {0.04166666667, -0.01996527778, 0.02709538966}; @@ -569,82 +571,82 @@ __rooglobal__ void computeLandau(BatchesHandle batches) return u * u * (1 + (a2[0] + a2[1] * u) * u); }; - Batch X = batches[0]; - Batch M = batches[1]; - Batch S = batches[2]; + Batch X = batches.args[0]; + Batch M = batches.args[1]; + Batch S = batches.args[2]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = (X[i] - M[i]) / S[i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = (X[i] - M[i]) / S[i]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (S[i] <= 0.0) { - batches._output[i] = 0; - } else if (batches._output[i] < -5.5) { - batches._output[i] = case0(batches._output[i]); - } else if (batches._output[i] < -1.0) { - batches._output[i] = case1(batches._output[i]); - } else if (batches._output[i] < 1.0) { - batches._output[i] = case2(batches._output[i]); - } else if (batches._output[i] < 5.0) { - batches._output[i] = case3(batches._output[i]); - } else if (batches._output[i] < 12.0) { - batches._output[i] = case4(batches._output[i]); - } else if (batches._output[i] < 50.0) { - batches._output[i] = case5(batches._output[i]); - } else if (batches._output[i] < 300.) 
{ - batches._output[i] = case6(batches._output[i]); + batches.output[i] = 0; + } else if (batches.output[i] < -5.5) { + batches.output[i] = case0(batches.output[i]); + } else if (batches.output[i] < -1.0) { + batches.output[i] = case1(batches.output[i]); + } else if (batches.output[i] < 1.0) { + batches.output[i] = case2(batches.output[i]); + } else if (batches.output[i] < 5.0) { + batches.output[i] = case3(batches.output[i]); + } else if (batches.output[i] < 12.0) { + batches.output[i] = case4(batches.output[i]); + } else if (batches.output[i] < 50.0) { + batches.output[i] = case5(batches.output[i]); + } else if (batches.output[i] < 300.) { + batches.output[i] = case6(batches.output[i]); } else { - batches._output[i] = case7(batches._output[i]); + batches.output[i] = case7(batches.output[i]); } } } -__rooglobal__ void computeLognormal(BatchesHandle batches) +__rooglobal__ void computeLognormal(Batches &batches) { - Batch X = batches[0]; - Batch M0 = batches[1]; - Batch K = batches[2]; + Batch X = batches.args[0]; + Batch M0 = batches.args[1]; + Batch K = batches.args[2]; constexpr double rootOf2pi = 2.506628274631000502415765284811; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double lnxOverM0 = fast_log(X[i] / M0[i]); double lnk = fast_log(K[i]); if (lnk < 0) lnk = -lnk; double arg = lnxOverM0 / lnk; arg *= -0.5 * arg; - batches._output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi); + batches.output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi); } } -__rooglobal__ void computeLognormalStandard(BatchesHandle batches) +__rooglobal__ void computeLognormalStandard(Batches &batches) { - Batch X = batches[0]; - Batch M0 = batches[1]; - Batch K = batches[2]; + Batch X = batches.args[0]; + Batch M0 = batches.args[1]; + Batch K = batches.args[2]; constexpr double rootOf2pi = 2.506628274631000502415765284811; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; 
i < batches.nEvents; i += STEP) { double lnxOverM0 = fast_log(X[i]) - M0[i]; double lnk = K[i]; if (lnk < 0) lnk = -lnk; double arg = lnxOverM0 / lnk; arg *= -0.5 * arg; - batches._output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi); + batches.output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi); } } -__rooglobal__ void computeNormalizedPdf(BatchesHandle batches) +__rooglobal__ void computeNormalizedPdf(Batches &batches) { - auto rawVal = batches[0]; - auto normVal = batches[1]; + auto rawVal = batches.args[0]; + auto normVal = batches.args[1]; int nEvalErrorsType0 = 0; int nEvalErrorsType1 = 0; int nEvalErrorsType2 = 0; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double out = 0.0; - // batches._output[i] = rawVal[i] / normVar[i]; + // batches.output[i] = rawVal[i] / normVar[i]; if (normVal[i] < 0. || (normVal[i] == 0. && rawVal[i] != 0)) { // Unreasonable normalisations. A zero integral can be tolerated if the function vanishes, though. out = RooNaNPacker::packFloatIntoNaN(-normVal[i] + (rawVal[i] < 0. ? -rawVal[i] : 0.)); @@ -660,15 +662,15 @@ __rooglobal__ void computeNormalizedPdf(BatchesHandle batches) } else { out = (rawVal[i] == 0. && normVal[i] == 0.) ? 0. 
: rawVal[i] / normVal[i]; } - batches._output[i] = out; + batches.output[i] = out; } if (nEvalErrorsType0 > 0) - batches.setExtraArg(0, batches.extraArg(0) + nEvalErrorsType0); + batches.extra[0] = batches.extra[0] + nEvalErrorsType0; if (nEvalErrorsType1 > 1) - batches.setExtraArg(1, batches.extraArg(1) + nEvalErrorsType1); + batches.extra[1] = batches.extra[1] + nEvalErrorsType1; if (nEvalErrorsType2 > 2) - batches.setExtraArg(2, batches.extraArg(2) + nEvalErrorsType2); + batches.extra[2] = batches.extra[2] + nEvalErrorsType2; } /* TMath::ASinH(x) needs to be replaced with ln( x + sqrt(x^2+1)) @@ -679,240 +681,244 @@ __rooglobal__ void computeNormalizedPdf(BatchesHandle batches) * ln is the logarithm that was solely present in the initial * formula, that is before the asinh replacement */ -__rooglobal__ void computeNovosibirsk(BatchesHandle batches) +__rooglobal__ void computeNovosibirsk(Batches &batches) { - Batch X = batches[0]; - Batch P = batches[1]; - Batch W = batches[2]; - Batch T = batches[3]; + Batch X = batches.args[0]; + Batch P = batches.args[1]; + Batch W = batches.args[2]; + Batch T = batches.args[3]; constexpr double xi = 2.3548200450309494; // 2 Sqrt( Ln(4) ) - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double argasinh = 0.5 * xi * T[i]; double argln = argasinh + 1 / fast_isqrt(argasinh * argasinh + 1); double asinh = fast_log(argln); double argln2 = 1 - (X[i] - P[i]) * T[i] / W[i]; double ln = fast_log(argln2); - batches._output[i] = ln / asinh; - batches._output[i] *= -0.125 * xi * xi * batches._output[i]; - batches._output[i] -= 2.0 / xi / xi * asinh * asinh; + batches.output[i] = ln / asinh; + batches.output[i] *= -0.125 * xi * xi * batches.output[i]; + batches.output[i] -= 2.0 / xi / xi * asinh * asinh; } // faster if you exponentiate in a separate loop (dark magic!) 
- for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = fast_exp(batches._output[i]); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = fast_exp(batches.output[i]); } -__rooglobal__ void computePoisson(BatchesHandle batches) +__rooglobal__ void computePoisson(Batches &batches) { - Batch x = batches[0]; - Batch mean = batches[1]; - bool protectNegative = batches.extraArg(0); - bool noRounding = batches.extraArg(1); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch x = batches.args[0]; + Batch mean = batches.args[1]; + bool protectNegative = batches.extra[0]; + bool noRounding = batches.extra[1]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double x_i = noRounding ? x[i] : floor(x[i]); - batches._output[i] = std::lgamma(x_i + 1.); + batches.output[i] = std::lgamma(x_i + 1.); } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double x_i = noRounding ? 
x[i] : floor(x[i]); const double logMean = fast_log(mean[i]); - const double logPoisson = x_i * logMean - mean[i] - batches._output[i]; - batches._output[i] = fast_exp(logPoisson); + const double logPoisson = x_i * logMean - mean[i] - batches.output[i]; + batches.output[i] = fast_exp(logPoisson); // Cosmetics if (x_i < 0) { - batches._output[i] = 0; + batches.output[i] = 0; } else if (x_i == 0) { - batches._output[i] = 1 / fast_exp(mean[i]); + batches.output[i] = 1 / fast_exp(mean[i]); } if (protectNegative && mean[i] < 0) - batches._output[i] = 1.E-3; + batches.output[i] = 1.E-3; } } -__rooglobal__ void computePolynomial(BatchesHandle batches) +__rooglobal__ void computePolynomial(Batches &batches) { - const int nCoef = batches.extraArg(0); - const std::size_t nEvents = batches.getNEvents(); - Batch x = batches[nCoef]; + const int nCoef = batches.extra[0]; + const std::size_t nEvents = batches.nEvents; + Batch x = batches.args[nCoef]; for (size_t i = BEGIN; i < nEvents; i += STEP) { - batches._output[i] = batches[nCoef - 1][i]; + batches.output[i] = batches.args[nCoef - 1][i]; } // Indexes are in range 0..nCoef-1 but coefList[nCoef-1] has already been // processed. 
for (int k = nCoef - 2; k >= 0; k--) { for (size_t i = BEGIN; i < nEvents; i += STEP) { - batches._output[i] = batches[k][i] + x[i] * batches._output[i]; + batches.output[i] = batches.args[k][i] + x[i] * batches.output[i]; } } } -__rooglobal__ void computePower(BatchesHandle batches) +__rooglobal__ void computePower(Batches &batches) { - const int nCoef = batches.extraArg(0); - Batch x = batches[0]; + const int nCoef = batches.extra[0]; + Batch x = batches.args[0]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 0.0; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 0.0; for (int k = 0; k < nCoef; ++k) { - batches._output[i] += batches[2 * k + 1][i] * std::pow(x[i], batches[2 * k + 2][i]); + batches.output[i] += batches.args[2 * k + 1][i] * std::pow(x[i], batches.args[2 * k + 2][i]); } } } -__rooglobal__ void computeProdPdf(BatchesHandle batches) +__rooglobal__ void computeProdPdf(Batches &batches) { - const int nPdfs = batches.extraArg(0); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 1.; + const int nPdfs = batches.extra[0]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 1.; } for (int pdf = 0; pdf < nPdfs; pdf++) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] *= batches[pdf][i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] *= batches.args[pdf][i]; } } } -__rooglobal__ void computeRatio(BatchesHandle batches) +__rooglobal__ void computeRatio(Batches &batches) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = batches[0][i] / batches[1][i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = batches.args[0][i] / batches.args[1][i]; } } -__rooglobal__ void computeTruthModelExpBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelExpBasis(Batches &batches) { - const bool 
isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]); + batches.output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]); } } -__rooglobal__ void computeTruthModelSinBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelSinBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * fast_sin(x * batches[2][i]); + batches.output[i] = + isOutOfSign ? 
0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * fast_sin(x * batches.args[2][i]); } } -__rooglobal__ void computeTruthModelCosBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelCosBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * fast_cos(x * batches[2][i]); + batches.output[i] = + isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * fast_cos(x * batches.args[2][i]); } } -__rooglobal__ void computeTruthModelLinBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelLinBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); if (isOutOfSign) { - batches._output[i] = 0.0; + batches.output[i] = 0.0; } else { - const double tscaled = std::abs(x) / batches[1][i]; - batches._output[i] = fast_exp(-tscaled) * tscaled; + const double tscaled = std::abs(x) / batches.args[1][i]; + batches.output[i] = fast_exp(-tscaled) * tscaled; } } } -__rooglobal__ void computeTruthModelQuadBasis(BatchesHandle batches) +__rooglobal__ void 
computeTruthModelQuadBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); if (isOutOfSign) { - batches._output[i] = 0.0; + batches.output[i] = 0.0; } else { - const double tscaled = std::abs(x) / batches[1][i]; - batches._output[i] = fast_exp(-tscaled) * tscaled * tscaled; + const double tscaled = std::abs(x) / batches.args[1][i]; + batches.output[i] = fast_exp(-tscaled) * tscaled * tscaled; } } } -__rooglobal__ void computeTruthModelSinhBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelSinhBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * sinh(x * batches[2][i] * 0.5); + batches.output[i] = + isOutOfSign ? 
0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * sinh(x * batches.args[2][i] * 0.5); } } -__rooglobal__ void computeTruthModelCoshBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelCoshBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * cosh(x * batches[2][i] * .5); + batches.output[i] = + isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * cosh(x * batches.args[2][i] * .5); } } -__rooglobal__ void computeVoigtian(BatchesHandle batches) +__rooglobal__ void computeVoigtian(Batches &batches) { - Batch X = batches[0]; - Batch M = batches[1]; - Batch W = batches[2]; - Batch S = batches[3]; + Batch X = batches.args[0]; + Batch M = batches.args[1]; + Batch W = batches.args[2]; + Batch S = batches.args[3]; const double invSqrt2 = 0.707106781186547524400844362105; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double arg = (X[i] - M[i]) * (X[i] - M[i]); if (S[i] == 0.0 && W[i] == 0.0) { - batches._output[i] = 1.0; + batches.output[i] = 1.0; } else if (S[i] == 0.0) { - batches._output[i] = 1 / (arg + 0.25 * W[i] * W[i]); + batches.output[i] = 1 / (arg + 0.25 * W[i] * W[i]); } else if (W[i] == 0.0) { - batches._output[i] = fast_exp(-0.5 * arg / (S[i] * S[i])); + batches.output[i] = fast_exp(-0.5 * arg / (S[i] * S[i])); } else { - batches._output[i] = invSqrt2 / S[i]; + batches.output[i] = invSqrt2 / S[i]; } } - for (size_t i = 
BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (S[i] != 0.0 && W[i] != 0.0) { - if (batches._output[i] < 0) - batches._output[i] = -batches._output[i]; + if (batches.output[i] < 0) + batches.output[i] = -batches.output[i]; const double factor = W[i] > 0.0 ? 0.5 : -0.5; - RooHeterogeneousMath::STD::complex<double> z(batches._output[i] * (X[i] - M[i]), - factor * batches._output[i] * W[i]); - batches._output[i] *= RooHeterogeneousMath::faddeeva(z).real(); + RooHeterogeneousMath::STD::complex<double> z(batches.output[i] * (X[i] - M[i]), + factor * batches.output[i] * W[i]); + batches.output[i] *= RooHeterogeneousMath::faddeeva(z).real(); } } } /// Returns a std::vector of pointers to the compute functions in this file. -std::vector<void (*)(BatchesHandle)> getFunctions() +std::vector<void (*)(Batches &)> getFunctions() { return {computeAddPdf, computeArgusBG, diff --git a/roofit/batchcompute/src/RooBatchCompute.cu b/roofit/batchcompute/src/RooBatchCompute.cu index 91326671aecdb..ce8b10197e8ee 100644 --- a/roofit/batchcompute/src/RooBatchCompute.cu +++ b/roofit/batchcompute/src/RooBatchCompute.cu @@ -42,27 +42,28 @@ namespace { void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs) { - batches._nEvents = nEvents; - batches._nBatches = nBatches; - batches._nExtraArgs = nExtraArgs; - batches._output = output; + batches.nEvents = nEvents; + batches.nBatches = nBatches; + batches.nExtra = nExtraArgs; + batches.output = output; } void fillArrays(Batch *arrays, VarSpan vars, double *buffer, double *bufferDevice, std::size_t nEvents) { for (int i = 0; i < vars.size(); i++) { const std::span<const double> &span = vars[i]; - if (!span.empty() && span.size() < nEvents) { + arrays[i]._isVector = span.empty() || span.size() >= nEvents; + if (!arrays[i]._isVector) { // In the scalar case, the value is not on the GPU yet, so we have to // copy the value to the GPU buffer. 
buffer[i] = span[0]; - arrays[i].set(bufferDevice + i, false); + arrays[i]._array = bufferDevice + i; } else { // In the vector input cases, they are already on the GPU, so we can // fill be buffer with some dummy value and set the input span // directly. buffer[i] = 0.0; - arrays[i].set(span.data(), true); + arrays[i]._array = span.data(); } } } @@ -86,13 +87,13 @@ int getGridSize(std::size_t n) } // namespace -std::vector<void (*)(BatchesHandle)> getFunctions(); +std::vector<void (*)(Batches &)> getFunctions(); /// This class overrides some RooBatchComputeInterface functions, for the /// purpose of providing a cuda specific implementation of the library. class RooBatchComputeClass : public RooBatchComputeInterface { private: - const std::vector<void (*)(BatchesHandle)> _computeFunctions; + const std::vector<void (*)(Batches &)> _computeFunctions; public: RooBatchComputeClass() : _computeFunctions(getFunctions()) @@ -139,11 +140,11 @@ public: fillBatches(*batches, output, nEvents, vars.size(), extraArgs.size()); fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents); - batches->_arrays = arraysDevice; + batches->args = arraysDevice; if (!extraArgs.empty()) { std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost); - batches->_extraArgs = extraArgsDevice; + batches->extra = extraArgsDevice; } copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream()); diff --git a/roofit/batchcompute/src/RooBatchCompute.cxx b/roofit/batchcompute/src/RooBatchCompute.cxx index 149ae0e2fbfac..d84c46b81936f 100644 --- a/roofit/batchcompute/src/RooBatchCompute.cxx +++ b/roofit/batchcompute/src/RooBatchCompute.cxx @@ -44,29 +44,39 @@ namespace { void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, ArgSpan extraArgs) { - batches._extraArgs = extraArgs.data(); - batches._nEvents = nEvents; - batches._nBatches = nBatches; - batches._nExtraArgs = extraArgs.size(); - batches._output = output; + batches.extra = extraArgs.data(); + batches.nEvents = nEvents; + batches.nBatches = nBatches; + 
batches.nExtra = extraArgs.size(); + batches.output = output; } void fillArrays(std::span<Batch> arrays, VarSpan vars, std::size_t nEvents) { for (std::size_t i = 0; i < vars.size(); i++) { - arrays[i].set(vars[i].data(), vars[i].empty() || vars[i].size() >= nEvents); + arrays[i]._array = vars[i].data(); + arrays[i]._isVector = vars[i].empty() || vars[i].size() >= nEvents; } } +inline void advance(Batches &batches, std::size_t nEvents) +{ + for (std::size_t i = 0; i < batches.nBatches; i++) { + Batch &arg = batches.args[i]; + arg._array += arg._isVector * nEvents; + } + batches.output += nEvents; +} + } // namespace -std::vector<void (*)(BatchesHandle)> getFunctions(); +std::vector<void (*)(Batches &)> getFunctions(); /// This class overrides some RooBatchComputeInterface functions, for the /// purpose of providing a CPU specific implementation of the library. class RooBatchComputeClass : public RooBatchComputeInterface { private: - const std::vector<void (*)(BatchesHandle)> _computeFunctions; + const std::vector<void (*)(Batches &)> _computeFunctions; public: RooBatchComputeClass() : _computeFunctions(getFunctions()) @@ -118,22 +128,22 @@ class RooBatchComputeClass : public RooBatchComputeInterface { std::vector<Batch> arrays(vars.size()); fillBatches(batches, output, nEventsPerThread, vars.size(), extraArgs); fillArrays(arrays, vars, nEvents); - batches._arrays = arrays.data(); - batches.advance(batches.getNEvents() * idx); + batches.args = arrays.data(); + advance(batches, batches.nEvents * idx); // Set the number of events of the last Batches object as the remaining events if (idx == nThreads - 1) { - batches.setNEvents(nEvents - idx * batches.getNEvents()); + batches.nEvents = nEvents - idx * batches.nEvents; } - std::size_t events = batches.getNEvents(); - batches.setNEvents(bufferSize); + std::size_t events = batches.nEvents; + batches.nEvents = bufferSize; while (events > bufferSize) { _computeFunctions[computer](batches); - batches.advance(bufferSize); + advance(batches, bufferSize); events -= bufferSize; } - batches.setNEvents(events); + batches.nEvents = 
events; _computeFunctions[computer](batches); return 0; }; @@ -150,16 +160,16 @@ class RooBatchComputeClass : public RooBatchComputeInterface { std::vector<Batch> arrays(vars.size()); fillBatches(batches, output, nEvents, vars.size(), extraArgs); fillArrays(arrays, vars, nEvents); - batches._arrays = arrays.data(); + batches.args = arrays.data(); - std::size_t events = batches.getNEvents(); - batches.setNEvents(bufferSize); + std::size_t events = batches.nEvents; + batches.nEvents = bufferSize; while (events > bufferSize) { _computeFunctions[computer](batches); - batches.advance(bufferSize); + advance(batches, bufferSize); events -= bufferSize; } - batches.setNEvents(events); + batches.nEvents = events; _computeFunctions[computer](batches); } }