diff --git a/python/distrdf/backends/check_backend.py b/python/distrdf/backends/check_backend.py index 42477eda12..fc8fe36f1b 100644 --- a/python/distrdf/backends/check_backend.py +++ b/python/distrdf/backends/check_backend.py @@ -123,7 +123,7 @@ def init(value): df = df.Define("u", "userValue").Histo1D( ("name", "title", 1, 100, 130), "u") - + h = df.GetValue() assert h.GetMean() == 123 @@ -133,22 +133,23 @@ class TestEmptyTreeError: Tests with emtpy trees. """ - def test_histo_from_empty_root_file(self, payload): + @pytest.mark.parametrize("datasource", ["ttree","rntuple"]) + def test_histo_from_empty_root_file(self, payload, datasource): """ Check that when performing operations with the distributed backend on an RDataFrame without entries, DistRDF raises an error. """ connection, backend = payload + datasetname = "empty" + filename = f"../data/{datasource}/empty.root" # Create an RDataFrame from a file with an empty tree if backend == "dask": RDataFrame = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame - rdf = RDataFrame( - "empty", "../data/ttree/empty.root", daskclient=connection) + rdf = RDataFrame(datasetname, filename, daskclient=connection) elif backend == "spark": RDataFrame = ROOT.RDF.Experimental.Distributed.Spark.RDataFrame - rdf = RDataFrame("empty", "../data/ttree/empty.root", - sparkcontext=connection) + rdf = RDataFrame(datasetname, filename, sparkcontext=connection) histo = rdf.Histo1D(("empty", "empty", 10, 0, 10), "mybranch") # Get entries in the histogram, raises error @@ -161,7 +162,6 @@ def test_count_with_some_empty_trees(self, payload): not contribute to how many entries are processed in the distributed execution. """ - connection, backend = payload treenames = [f"tree_{i}" for i in range(3)] filenames = [ @@ -200,21 +200,22 @@ class TestWithRepeatedTree: is used multiple times. """ - def test_count_with_same_tree_repeated(self, payload): + @pytest.mark.parametrize("datasource", ["ttree","rntuple"]) + def test_count_with_same_tree_repeated(self, payload, datasource): """ Count entries of a dataset with three times the same tree. """ connection, backend = payload - treename = "tree_0" - filename = "../data/ttree/distrdf_roottest_check_backend_0.root" + datasetname = "tree_0" + filename = f"../data/{datasource}/distrdf_roottest_check_backend_0.root" filenames = [filename] * 3 if backend == "dask": RDataFrame = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame - rdf = RDataFrame(treename, filenames, daskclient=connection) + rdf = RDataFrame(datasetname, filenames, daskclient=connection) elif backend == "spark": RDataFrame = ROOT.RDF.Experimental.Distributed.Spark.RDataFrame - rdf = RDataFrame(treename, filenames, sparkcontext=connection) + rdf = RDataFrame(datasetname, filenames, sparkcontext=connection) assert rdf.Count().GetValue() == 300 diff --git a/python/distrdf/backends/check_cloned_actions.py b/python/distrdf/backends/check_cloned_actions.py index bfd45a1880..3529a24508 100644 --- a/python/distrdf/backends/check_cloned_actions.py +++ b/python/distrdf/backends/check_cloned_actions.py @@ -10,31 +10,39 @@ class TestAsNumpy: distributed configurations. """ - @pytest.mark.parametrize("nparts", range(1, 21)) - def test_clone_asnumpyresult(self, payload, nparts): + @pytest.mark.parametrize("nfiles", [1, 3, 7]) + @pytest.mark.parametrize("nparts", [1, 2, 3, 7, 8, 15, 16, 21]) + @pytest.mark.parametrize("datasource", ["ttree", "rntuple"]) + def test_clone_asnumpyresult(self, payload, nfiles, nparts, datasource): """ Test that the correct values of the numpy array are retrieved from distributed execution irrespective of the number of partitions. """ datasetname = "Events" - filename = "../data/ttree/distrdf_roottest_check_cloned_actions_asnumpy.root" + filename = f"../data/{datasource}/distrdf_roottest_check_cloned_actions_asnumpy.root" + inputfiles = [filename] * nfiles connection, backend = payload if backend == "dask": RDataFrame = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame - distrdf = RDataFrame(datasetname, filename, + distrdf = RDataFrame(datasetname, inputfiles, daskclient=connection, npartitions=nparts) elif backend == "spark": RDataFrame = ROOT.RDF.Experimental.Distributed.Spark.RDataFrame - distrdf = RDataFrame(datasetname, filename, + distrdf = RDataFrame(datasetname, inputfiles, sparkcontext=connection, npartitions=nparts) - localdf = ROOT.RDataFrame("Events", filename) + localdf = ROOT.RDataFrame("Events", inputfiles) vals_distrdf = distrdf.AsNumpy(["event"]) vals_localdf = localdf.AsNumpy(["event"]) - assert all(vals_localdf["event"] == numpy.sort(vals_distrdf["event"])) + # Distributed mode does not guarantee the order of execution of the tasks + # thus the output numpy array is unsorted. We also sort the output array + # of the local execution so that in case we test with multiple files + # the values of the arrays can be properly aligned (otherwise it would + # always fail). + assert all(numpy.sort(vals_localdf["event"]) == numpy.sort(vals_distrdf["event"])) if __name__ == "__main__": diff --git a/python/distrdf/backends/check_histo_write.py b/python/distrdf/backends/check_histo_write.py index 7cc08321a3..33dbbaecbc 100644 --- a/python/distrdf/backends/check_histo_write.py +++ b/python/distrdf/backends/check_histo_write.py @@ -16,7 +16,8 @@ class TestDaskHistoWrite: gaus_stdev = 1 delta_equal = 0.01 - def test_write_histo(self, payload): + @pytest.mark.parametrize("datasource", ["ttree","rntuple"]) + def test_write_histo(self, payload, datasource): """ Tests that an histogram is correctly written to a .root file created before the execution of the event loop. @@ -26,7 +27,7 @@ def test_write_histo(self, payload): with ROOT.TFile("out_file.root", "recreate") as outfile: # We can reuse the same dataset from another test treename = "T" - filename = "../data/ttree/distrdf_roottest_check_friend_trees_main.root" + filename = f"../data/{datasource}/distrdf_roottest_check_friend_trees_main.root" # Create a DistRDF RDataFrame with the parent and the friend trees connection, backend = payload if backend == "dask": diff --git a/python/distrdf/backends/check_reducer_merge.py b/python/distrdf/backends/check_reducer_merge.py index 5221c6da08..7640cd53eb 100644 --- a/python/distrdf/backends/check_reducer_merge.py +++ b/python/distrdf/backends/check_reducer_merge.py @@ -490,12 +490,13 @@ def test_redefine_one_column(self, payload): assert sum_before.GetValue() == 10.0 assert sum_after.GetValue() == 20.0 - def test_distributed_stddev(self, payload): + @pytest.mark.parametrize("datasource", ["ttree","rntuple"]) + def test_distributed_stddev(self, payload, datasource): """Test support for the StdDev action.""" # Create dataset with fixed series of entries treename = "tree" - filename = "../data/ttree/distrdf_roottest_check_reducer_merge_1.root" + filename = f"../data/{datasource}/distrdf_roottest_check_reducer_merge_1.root" connection, backend = payload if backend == "dask": @@ -511,11 +512,12 @@ def test_distributed_stddev(self, payload): assert std.GetValue() == pytest.approx(expected, rel), f"{std.GetValue()}!={expected}" - def test_distributed_stats(self, payload): + @pytest.mark.parametrize("datasource", ["ttree","rntuple"]) + def test_distributed_stats(self, payload, datasource): """Test support for the Stats action.""" # Create dataset with fixed series of entries treename = "tree" - filename = "../data/ttree/distrdf_roottest_check_reducer_merge_1.root" + filename = f"../data/{datasource}/distrdf_roottest_check_reducer_merge_1.root" connection, backend = payload if backend == "dask": diff --git a/python/distrdf/backends/check_rungraphs.py b/python/distrdf/backends/check_rungraphs.py index 17152e19f1..c34b453db7 100644 --- a/python/distrdf/backends/check_rungraphs.py +++ b/python/distrdf/backends/check_rungraphs.py @@ -8,13 +8,14 @@ class TestRunGraphs: """Tests usage of RunGraphs function with Dask backend""" - def test_rungraphs_dask_3histos(self, payload): + @pytest.mark.parametrize("datasource", ["ttree","rntuple"]) + def test_rungraphs_dask_3histos(self, payload, datasource): """ Submit three different Dask RDF graphs concurrently """ # Create a test file for processing treename = "tree" - filename = "../data/ttree/distrdf_roottest_check_rungraphs.root" + filename = f"../data/{datasource}/distrdf_roottest_check_rungraphs.root" nentries = 10000 connection, backend = payload if backend == "dask": diff --git a/python/distrdf/data/rntuple/_create_datasets.C b/python/distrdf/data/rntuple/_create_datasets.C new file mode 100644 index 0000000000..73c492a41b --- /dev/null +++ b/python/distrdf/data/rntuple/_create_datasets.C @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include + +// Import classes from experimental namespace for the time being +using RNTupleModel = ROOT::Experimental::RNTupleModel; +using RNTupleWriter = ROOT::Experimental::RNTupleWriter; + +void create_check_backend() +{ + + auto every_n_entries = 10; + std::vector ntpl_names{"tree_0", "tree_1", "tree_2"}; + std::vector filenames{ + "distrdf_roottest_check_backend_0.root", + "distrdf_roottest_check_backend_1.root", + "distrdf_roottest_check_backend_2.root"}; + + for (auto i = 0; i < ntpl_names.size(); i++) + { + auto model = RNTupleModel::Create(); + auto fldX = model->MakeField("x", 1); + auto ntpl = RNTupleWriter::Recreate(std::move(model), ntpl_names[i], filenames[i]); + for (auto j = 0; j < 100; j++) + { + if (j % every_n_entries == 0) + { + ntpl->CommitCluster(); + } + ntpl->Fill(); + } + } +} + +void create_cloned_actions() +{ + std::vector clusters{ + 66, 976, 1542, 1630, 2477, 3566, 4425, 4980, 5109, 5381, 5863, 6533, 6590, 6906, 8312, 8361, 8900, 8952, 9144, 9676}; + std::string datasetname{"Events"}; + std::string filename{"distrdf_roottest_check_cloned_actions_asnumpy.root"}; + + auto model = RNTupleModel::Create(); + auto fldEv = model->MakeField("event"); + auto ntpl = RNTupleWriter::Recreate(std::move(model), datasetname, filename); + for (auto i = 0; i < 10000; i++) + { + *fldEv = i; + // Flush a cluster of entries at the defined cluster boundaries + if (std::find(clusters.begin(), clusters.end(), i) != clusters.end()) + { + ntpl->CommitCluster(); + } + ntpl->Fill(); + } +} + +void create_empty_rntuple() +{ + auto ntpl = RNTupleWriter::Recreate(RNTupleModel::Create(), "empty", "empty.root"); +} + +void create_definepersample() +{ + std::vector filenames{ + "distrdf_roottest_definepersample_sample1.root", + "distrdf_roottest_definepersample_sample2.root", + "distrdf_roottest_definepersample_sample3.root"}; + std::string ntpl_name = "Events"; + for (const auto &fn : filenames) + { + auto model = RNTupleModel::Create(); + auto fldX = model->MakeField("x"); + auto ntpl = RNTupleWriter::Recreate(std::move(model), ntpl_name, fn); + for (ULong64_t entry = 0; entry < 10; entry++) + { + *fldX = entry; + ntpl->Fill(); + } + } +} + +void create_friend_trees_alignment() +{ + std::vector ntpl_names{ + "distrdf_roottest_check_friend_trees_alignment_1", + "distrdf_roottest_check_friend_trees_alignment_2", + "distrdf_roottest_check_friend_trees_alignment_3", + "distrdf_roottest_check_friend_trees_alignment_4", + "distrdf_roottest_check_friend_trees_alignment_5", + "distrdf_roottest_check_friend_trees_alignment_6"}; + std::vector filenames{ + "distrdf_roottest_check_friend_trees_alignment_1.root", + "distrdf_roottest_check_friend_trees_alignment_2.root", + "distrdf_roottest_check_friend_trees_alignment_3.root", + "distrdf_roottest_check_friend_trees_alignment_4.root", + "distrdf_roottest_check_friend_trees_alignment_5.root", + "distrdf_roottest_check_friend_trees_alignment_6.root"}; + std::vector> limits{ + {0, 10}, + {10, 20}, + {20, 30}, + {30, 40}, + {40, 50}, + {50, 60}, + }; + for (auto i = 0; i < limits.size(); i++) + { + auto model = RNTupleModel::Create(); + auto fldX = model->MakeField("x"); + auto ntpl = RNTupleWriter::Recreate(std::move(model), ntpl_names[i], filenames[i]); + for (ULong64_t entry = limits[i].first; entry < limits[i].second; entry++) + { + *fldX = entry; + ntpl->CommitCluster(); + ntpl->Fill(); + } + } +} + +void create_friend_trees() +{ + auto create_ntpl = [](const std::string &ntplname, const std::string &filename, int gaus_mean) + { + auto model = RNTupleModel::Create(); + auto fldX = model->MakeField("x"); + auto ntpl = RNTupleWriter::Recreate(std::move(model), ntplname, filename); + TRandom r; + for (auto i = 0; i < 10000; i++) + { + *fldX = r.Gaus(gaus_mean, 1); + ntpl->Fill(); + } + }; + + std::string main_ntplname = "T"; + std::string friend_ntplname = "TF"; + auto main_mean{10}; + auto friend_mean{20}; + std::string main_filename = "distrdf_roottest_check_friend_trees_main.root"; + std::string friend_filename = "distrdf_roottest_check_friend_trees_friend.root"; + + create_ntpl(main_ntplname, main_filename, main_mean); + create_ntpl(friend_ntplname, friend_filename, friend_mean); + + // 7584 + std::string ntpl_name_rn1 = "randomNumbers"; + std::string ntpl_name_rn2 = "randomNumbersBis"; + std::string filename_7584 = "distrdf_roottest_check_friend_trees_7584.root"; + TFile out_file{filename_7584.c_str(), "recreate"}; + { + auto model = RNTupleModel::Create(); + auto fldX = model->MakeField("x"); + auto ntpl = RNTupleWriter::Append(std::move(model), ntpl_name_rn1, out_file); + TRandom r; + for (auto i = 0; i < 10000; i++) + { + *fldX = r.Gaus(main_mean, 1); + ntpl->Fill(); + } + } + { + auto model = RNTupleModel::Create(); + auto fldX = model->MakeField("x"); + auto ntpl = RNTupleWriter::Append(std::move(model), ntpl_name_rn2, out_file); + TRandom r; + for (auto i = 0; i < 10000; i++) + { + *fldX = r.Gaus(friend_mean, 1); + ntpl->Fill(); + } + } +} + +void create_reducer_merge() +{ + std::string ntpl_name = "tree"; + std::string filename = "distrdf_roottest_check_reducer_merge_1.root"; + auto model = RNTupleModel::Create(); + auto fldV = model->MakeField("v"); + auto ntpl = RNTupleWriter::Recreate(std::move(model), ntpl_name, filename); + for (auto i = 0; i < 100; i++) + { + *fldV = static_cast(i); + ntpl->Fill(); + } +} + +void create_rungraphs() +{ + std::string ntpl_name = "tree"; + std::string filename = "distrdf_roottest_check_rungraphs.root"; + auto model = RNTupleModel::Create(); + auto fldb1 = model->MakeField("b1", 42); + auto fldb2 = model->MakeField("b2", 42); + auto fldb3 = model->MakeField("b3", 42); + auto ntpl = RNTupleWriter::Recreate(std::move(model), ntpl_name, filename); + auto nentries = 10000; + auto every_n_entries = 5000; + for (auto i = 0; i < nentries; i++) + { + if (i % every_n_entries == 0) + { + ntpl->CommitCluster(); + } + ntpl->Fill(); + } +} + +void _create_datasets() +{ + create_check_backend(); + create_cloned_actions(); + create_empty_rntuple(); + create_definepersample(); + create_friend_trees_alignment(); + create_friend_trees(); + create_reducer_merge(); + create_rungraphs(); +} \ No newline at end of file diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_backend_0.root b/python/distrdf/data/rntuple/distrdf_roottest_check_backend_0.root new file mode 100644 index 0000000000..731fa9bfba Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_backend_0.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_backend_1.root b/python/distrdf/data/rntuple/distrdf_roottest_check_backend_1.root new file mode 100644 index 0000000000..606a07d9b3 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_backend_1.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_backend_2.root b/python/distrdf/data/rntuple/distrdf_roottest_check_backend_2.root new file mode 100644 index 0000000000..a7b17891dd Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_backend_2.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_cloned_actions_asnumpy.root b/python/distrdf/data/rntuple/distrdf_roottest_check_cloned_actions_asnumpy.root new file mode 100644 index 0000000000..47c18db8f8 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_cloned_actions_asnumpy.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_7584.root b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_7584.root new file mode 100644 index 0000000000..eb4648e024 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_7584.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_1.root b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_1.root new file mode 100644 index 0000000000..8dbf025f97 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_1.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_2.root b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_2.root new file mode 100644 index 0000000000..a7c405a20a Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_2.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_3.root b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_3.root new file mode 100644 index 0000000000..2d5aa61f9a Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_3.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_4.root b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_4.root new file mode 100644 index 0000000000..23364e53fe Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_4.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_5.root b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_5.root new file mode 100644 index 0000000000..26d76e3d07 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_5.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_6.root b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_6.root new file mode 100644 index 0000000000..01fa16cc6a Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_alignment_6.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_friend.root b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_friend.root new file mode 100644 index 0000000000..11b55b8618 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_friend.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_main.root b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_main.root new file mode 100644 index 0000000000..6cd1b9e1b5 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_friend_trees_main.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_reducer_merge_1.root b/python/distrdf/data/rntuple/distrdf_roottest_check_reducer_merge_1.root new file mode 100644 index 0000000000..e28cb991b2 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_reducer_merge_1.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_check_rungraphs.root b/python/distrdf/data/rntuple/distrdf_roottest_check_rungraphs.root new file mode 100644 index 0000000000..526fdd96d9 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_check_rungraphs.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_definepersample_sample1.root b/python/distrdf/data/rntuple/distrdf_roottest_definepersample_sample1.root new file mode 100644 index 0000000000..53b73a94b6 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_definepersample_sample1.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_definepersample_sample2.root b/python/distrdf/data/rntuple/distrdf_roottest_definepersample_sample2.root new file mode 100644 index 0000000000..df15ae9f99 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_definepersample_sample2.root differ diff --git a/python/distrdf/data/rntuple/distrdf_roottest_definepersample_sample3.root b/python/distrdf/data/rntuple/distrdf_roottest_definepersample_sample3.root new file mode 100644 index 0000000000..ec585fd911 Binary files /dev/null and b/python/distrdf/data/rntuple/distrdf_roottest_definepersample_sample3.root differ diff --git a/python/distrdf/data/rntuple/empty.root b/python/distrdf/data/rntuple/empty.root new file mode 100644 index 0000000000..0b5f6adcc0 Binary files /dev/null and b/python/distrdf/data/rntuple/empty.root differ