Skip to content

Commit

Permalink
[rdf] Allow for more complex file names in constructor
Browse files Browse the repository at this point in the history
Extend the checks done on the file name string in the RDataFrame constructor(s) to be more resilient against different token types (e.g. wildcards vs URL options).

Fixes #16475
  • Loading branch information
vepadulano committed Nov 12, 2024
1 parent b2182e4 commit 1c88138
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 6 deletions.
32 changes: 26 additions & 6 deletions tree/dataframe/src/RLoopManager.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -1125,15 +1125,35 @@ void RLoopManager::SetEmptyEntryRange(std::pair<ULong64_t, ULong64_t> &&newRange
*/
std::unique_ptr<TFile> OpenFileWithSanityChecks(std::string_view fileNameGlob)
{
bool fileIsGlob = [&fileNameGlob]() {
const std::vector<std::string_view> wildcards = {"[", "]", "*", "?"}; // Wildcards accepted by TChain::Add
return std::any_of(wildcards.begin(), wildcards.end(),
[&fileNameGlob](const auto &wc) { return fileNameGlob.find(wc) != std::string_view::npos; });
// Follow same logic in TChain::Add to find the correct string to look for globbing:
// - If the extension ".root" is present in the file name, pass along the basename.
// - If not, use the "?" token to delimit the part of the string which represents the basename.
// - Otherwise, pass the full filename.
auto &&baseNameAndQuery = [&fileNameGlob]() {
constexpr std::string_view delim{".root"};
if (auto &&it = std::find_end(fileNameGlob.begin(), fileNameGlob.end(), delim.begin(), delim.end());
it != fileNameGlob.end()) {
auto &&distanceToEndOfDelim = std::distance(fileNameGlob.begin(), it + delim.length());
return std::make_pair(fileNameGlob.substr(0, distanceToEndOfDelim), fileNameGlob.substr(distanceToEndOfDelim));
} else if (auto &&lastQuestionMark = fileNameGlob.find_last_of('?'); lastQuestionMark != std::string_view::npos)
return std::make_pair(fileNameGlob.substr(0, lastQuestionMark), fileNameGlob.substr(lastQuestionMark));
else
return std::make_pair(fileNameGlob, std::string_view{});
}();
// Captured structured bindings variable are only valid since C++20
auto &&baseName = baseNameAndQuery.first;
auto &&query = baseNameAndQuery.second;

const auto nameHasWildcard = [&baseName]() {
constexpr std::array<char, 4> wildCards{'[', ']', '*', '?'}; // Wildcards accepted by TChain::Add
return std::any_of(wildCards.begin(), wildCards.end(),
[&baseName](auto &&wc) { return baseName.find(wc) != std::string_view::npos; });
}();

// Open first file in case of glob, suppose all files in the glob use the same data format
std::string fileToOpen{fileIsGlob ? ROOT::Internal::TreeUtils::ExpandGlob(std::string{fileNameGlob})[0]
: fileNameGlob};
std::string fileToOpen{nameHasWildcard
? ROOT::Internal::TreeUtils::ExpandGlob(std::string{baseName})[0] + std::string{query}
: fileNameGlob};

::TDirectory::TContext ctxt; // Avoid changing gDirectory;
std::unique_ptr<TFile> inFile{TFile::Open(fileToOpen.c_str(), "READ_WITHOUT_GLOBALREGISTRATION")};
Expand Down
47 changes: 47 additions & 0 deletions tree/dataframe/test/dataframe_regression.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,53 @@ TEST_P(RDFRegressionTests, UseAfterDeleteOfSampleCallbacks)
df.Count().GetValue();
}

// #16475
struct DatasetGuard {
DatasetGuard(std::string_view treeName, std::string_view fileName) : fTreeName(treeName), fFileName(fileName)
{
TFile f{fFileName.c_str(), "recreate"};
TTree t{fTreeName.c_str(), fTreeName.c_str()};
int x{};
t.Branch("x", &x, "x/I");
for (auto i = 0; i < 10; i++) {
x = i;
t.Fill();
}
f.Write();
}
DatasetGuard(const DatasetGuard &) = delete;
DatasetGuard &operator=(const DatasetGuard &) = delete;
DatasetGuard(DatasetGuard &&) = delete;
DatasetGuard &operator=(DatasetGuard &&) = delete;
~DatasetGuard() { std::remove(fFileName.c_str()); }
std::string fTreeName;
std::string fFileName;
};

TEST_P(RDFRegressionTests, FileNameQuery)
{
DatasetGuard dataset{"events", "dataframe_regression_filenamequery.root"};
constexpr auto fileNameWithQuery{"dataframe_regression_filenamequery.root?myq=xyz"};
ROOT::RDataFrame df{dataset.fTreeName, fileNameWithQuery};
EXPECT_EQ(df.Count().GetValue(), 10);
}

TEST_P(RDFRegressionTests, FileNameWildcardQuery)
{
DatasetGuard dataset{"events", "dataframe_regression_filenamewildcardquery.root"};
constexpr auto fileNameWithQuery{"dataframe_regress?on_filenamewildcardquery.root?myq=xyz"};
ROOT::RDataFrame df{dataset.fTreeName, fileNameWithQuery};
EXPECT_EQ(df.Count().GetValue(), 10);
}

TEST_P(RDFRegressionTests, FileNameQueryNoExt)
{
DatasetGuard dataset{"events", "dataframe_regression_filenamequerynoext"};
constexpr auto fileNameWithQuery{"dataframe_regression_filenamequerynoext?myq=xyz"};
ROOT::RDataFrame df{dataset.fTreeName, fileNameWithQuery};
EXPECT_EQ(df.Count().GetValue(), 10);
}

// run single-thread tests
INSTANTIATE_TEST_SUITE_P(Seq, RDFRegressionTests, ::testing::Values(false));

Expand Down

0 comments on commit 1c88138

Please sign in to comment.