From fdb6e962ce95edbb712497db96dc46e925d62198 Mon Sep 17 00:00:00 2001 From: Jonas Rembser Date: Mon, 4 Mar 2024 09:09:13 +0100 Subject: [PATCH] [RF] Avoid using static `std::vector` buffer in RooBatchCompute In the RooBatchCompute CPU library, all scalar inputs have to be copied n times into a buffer that is as long as the SIMD registers, to allow for vectorization in all cases. To avoid frequent memory allocations, this buffer was made a `static` variable in the original implementation of the batchcompute library, which of course made it non-threadsafe. This is now hitting us, because RooFit needs to be able to do multiple fits concurrently. This is a requirement for CMSSW, and a blocker for ROOT master adoption in CMSSW since the new CPU backend is the default: https://github.com/cms-sw/cmsdist/pull/9034 This commit fixes the concurrency problem by doing the buffering in the DataMaps that are used in the `RooFit::Evaluator`. Like this, multiple computation graphs can be concurrently evaluated. It was tested with the ATLAS benchmarks in `rootbench` that the fitting performance remains the same. --- roofit/batchcompute/res/RooBatchCompute.h | 2 + roofit/batchcompute/src/Batches.h | 2 - roofit/batchcompute/src/RooBatchCompute.cxx | 27 ++++--------- roofit/roofitcore/inc/RooFit/Detail/DataMap.h | 11 ++++-- .../roofitcore/src/RooFit/Detail/DataMap.cxx | 39 +++++++++++++++++-- roofit/roofitcore/src/RooFit/Evaluator.cxx | 5 +++ roofit/roofitcore/src/RooPolyVar.cxx | 11 ++++-- 7 files changed, 65 insertions(+), 32 deletions(-) diff --git a/roofit/batchcompute/res/RooBatchCompute.h b/roofit/batchcompute/res/RooBatchCompute.h index a7b5c38914f93..c8554331aae6e 100644 --- a/roofit/batchcompute/res/RooBatchCompute.h +++ b/roofit/batchcompute/res/RooBatchCompute.h @@ -45,6 +45,8 @@ typedef std::span ArgSpan; typedef double *__restrict RestrictArr; typedef const double *__restrict InputArr; +constexpr std::size_t bufferSize = 64; + void init(); /// Minimal configuration struct to steer the evaluation of a single node with diff --git a/roofit/batchcompute/src/Batches.h b/roofit/batchcompute/src/Batches.h index 69c4a7837bd53..e461cb9e41fa9 100644 --- a/roofit/batchcompute/src/Batches.h +++ b/roofit/batchcompute/src/Batches.h @@ -29,8 +29,6 @@ so that they can contain data for every kind of compute function. namespace RooBatchCompute { -constexpr std::size_t bufferSize = 64; - namespace RF_ARCH { class Batch { diff --git a/roofit/batchcompute/src/RooBatchCompute.cxx b/roofit/batchcompute/src/RooBatchCompute.cxx index d56735dd7d897..149ae0e2fbfac 100644 --- a/roofit/batchcompute/src/RooBatchCompute.cxx +++ b/roofit/batchcompute/src/RooBatchCompute.cxx @@ -51,20 +51,10 @@ void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size batches._output = output; } -void fillArrays(std::vector &arrays, VarSpan vars, double *buffer, std::size_t nEvents) +void fillArrays(std::span arrays, VarSpan vars, std::size_t nEvents) { - - arrays.resize(vars.size()); - for (size_t i = 0; i < vars.size(); i++) { - const std::span &span = vars[i]; - if (!span.empty() && span.size() < nEvents) { - // In the scalar case, copy the value to each element of vector input - // buffer. - std::fill_n(&buffer[i * bufferSize], bufferSize, span.data()[0]); - arrays[i].set(&buffer[i * bufferSize], false); - } else { - arrays[i].set(span.data(), true); - } + for (std::size_t i = 0; i < vars.size(); i++) { + arrays[i].set(vars[i].data(), vars[i].empty() || vars[i].size() >= nEvents); } } @@ -112,9 +102,6 @@ class RooBatchComputeClass : public RooBatchComputeInterface { void compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars, ArgSpan extraArgs) override { - static std::vector buffer; - buffer.resize(vars.size() * bufferSize); - if (ROOT::IsImplicitMTEnabled()) { ROOT::Internal::TExecutor ex; std::size_t nThreads = ex.GetPoolSize(); @@ -128,9 +115,9 @@ class RooBatchComputeClass : public RooBatchComputeInterface { // Fill a std::vector with the same object and with ~nEvents/nThreads // Then advance every object but the first to split the work between threads Batches batches; - std::vector arrays; + std::vector arrays(vars.size()); fillBatches(batches, output, nEventsPerThread, vars.size(), extraArgs); - fillArrays(arrays, vars, buffer.data(), nEvents); + fillArrays(arrays, vars, nEvents); batches._arrays = arrays.data(); batches.advance(batches.getNEvents() * idx); @@ -160,9 +147,9 @@ class RooBatchComputeClass : public RooBatchComputeInterface { // Fill a std::vector with the same object and with ~nEvents/nThreads // Then advance every object but the first to split the work between threads Batches batches; - std::vector arrays; + std::vector arrays(vars.size()); fillBatches(batches, output, nEvents, vars.size(), extraArgs); - fillArrays(arrays, vars, buffer.data(), nEvents); + fillArrays(arrays, vars, nEvents); batches._arrays = arrays.data(); std::size_t events = batches.getNEvents(); diff --git a/roofit/roofitcore/inc/RooFit/Detail/DataMap.h b/roofit/roofitcore/inc/RooFit/Detail/DataMap.h index 007431e4726e4..2a9d933b9051a 100644 --- a/roofit/roofitcore/inc/RooFit/Detail/DataMap.h +++ b/roofit/roofitcore/inc/RooFit/Detail/DataMap.h @@ -82,10 +82,7 @@ namespace Detail { class DataMap { public: - auto size() const - { - return _dataMap.size(); - } + auto size() const { return _dataMap.size(); } void resize(std::size_t n); inline void set(RooAbsArg const *arg, std::span const &span) @@ -119,8 +116,14 @@ class DataMap { RooBatchCompute::Config config(RooAbsArg const *arg) const; + void enableVectorBuffers(bool enable) { _enableVectorBuffers = enable; } + void resetVectorBuffers() { _bufferIdx = 0; } + private: std::vector> _dataMap; + bool _enableVectorBuffers = false; + std::vector> _buffers; + std::size_t _bufferIdx = 0; std::vector _cfgs; }; diff --git a/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx b/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx index b73f767d465bf..49600ea7977f7 100644 --- a/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx +++ b/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx @@ -16,17 +16,50 @@ #include #include +#include + +namespace { + +// To avoid deleted move assignment. +template +void assignSpan(std::span &to, std::span const &from) +{ + to = from; +} + +} // namespace + namespace RooFit { namespace Detail { std::span DataMap::at(RooAbsArg const *arg, RooAbsArg const * /*caller*/) { + std::span out; + if (!arg->hasDataToken()) { auto var = static_cast(arg); - return {&var->_value, 1}; + assignSpan(out, {&var->_value, 1}); + } else { + std::size_t idx = arg->dataToken(); + out = _dataMap[idx]; } - std::size_t idx = arg->dataToken(); - return _dataMap[idx]; + + if (!_enableVectorBuffers || out.size() != 1) { + return out; + } + + if (_bufferIdx == _buffers.size()) { + _buffers.emplace_back(RooBatchCompute::bufferSize); + } + + double *buffer = _buffers[_bufferIdx].data(); + + std::fill_n(buffer, RooBatchCompute::bufferSize, out[0]); + assignSpan(out, {buffer, 1}); + + ++_bufferIdx; + + return out; } void DataMap::setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config) diff --git a/roofit/roofitcore/src/RooFit/Evaluator.cxx b/roofit/roofitcore/src/RooFit/Evaluator.cxx index d7734c6edfbfa..ff018251ff106 100644 --- a/roofit/roofitcore/src/RooFit/Evaluator.cxx +++ b/roofit/roofitcore/src/RooFit/Evaluator.cxx @@ -384,7 +384,12 @@ void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info) buffer = info.buffer->cpuWritePtr(); } _dataMapCPU.set(node, {buffer, nOut}); + if (nOut > 1) { + _dataMapCPU.enableVectorBuffers(true); + } nodeAbsReal->computeBatch(buffer, nOut, _dataMapCPU); + _dataMapCPU.resetVectorBuffers(); + _dataMapCPU.enableVectorBuffers(false); #ifdef ROOFIT_CUDA if (info.copyAfterEvaluation) { _dataMapCUDA.set(node, {info.buffer->gpuReadPtr(), nOut}); diff --git a/roofit/roofitcore/src/RooPolyVar.cxx b/roofit/roofitcore/src/RooPolyVar.cxx index 88dff388026c4..ae6a48176e871 100644 --- a/roofit/roofitcore/src/RooPolyVar.cxx +++ b/roofit/roofitcore/src/RooPolyVar.cxx @@ -36,6 +36,7 @@ it can define. #include "TError.h" +#include #include #include @@ -139,10 +140,14 @@ void RooPolyVar::computeBatchImpl(RooAbsArg const* caller, double *output, size_ // Fill the coefficients for the skipped orders. By a conventions started in // RooPolynomial, if the zero-th order is skipped, it implies a coefficient // for the constant term of one. - const double zero = 1.0; - const double one = 1.0; + std::array zeros; + std::array ones; + std::fill_n(zeros.data(), zeros.size(), 0.0); + std::fill_n(ones.data(), ones.size(), 1.0); + std::span zerosSpan{zeros.data(), 1}; + std::span onesSpan{ones.data(), 1}; for (int i = lowestOrder - 1; i >= 0; --i) { - vars.push_back(i == 0 ? std::span{&one, 1} : std::span{&zero, 1}); + vars.push_back(i == 0 ? onesSpan : zerosSpan); } for (RooAbsArg *coef : coefs) {