From b9c8216c5c99f451df6438ba0af29492e81c3fcb Mon Sep 17 00:00:00 2001
From: Jim Bosch
Date: Fri, 13 Sep 2024 16:56:01 -0400
Subject: [PATCH] Fix collection glob detection in query-datasets.

CHAINED collections could masquerade as globs with the previous logic.
With the new logic, we're probably trying to turn each string into a
regex more than one time, but that shouldn't be very expensive.
---
 doc/changes/DM-46339.bugfix.md                 |  1 +
 python/lsst/daf/butler/script/queryDatasets.py |  3 ++-
 tests/test_cliCmdQueryDatasets.py              | 15 ++++++++++++---
 3 files changed, 15 insertions(+), 4 deletions(-)
 create mode 100644 doc/changes/DM-46339.bugfix.md

diff --git a/doc/changes/DM-46339.bugfix.md b/doc/changes/DM-46339.bugfix.md
new file mode 100644
index 0000000000..ac6bc9184a
--- /dev/null
+++ b/doc/changes/DM-46339.bugfix.md
@@ -0,0 +1 @@
+Fix a bug in `butler query-datasets` that incorrectly rejected a find-first query against a chain collection as having a glob.
diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py
index 15edb3c365..36058d4852 100644
--- a/python/lsst/daf/butler/script/queryDatasets.py
+++ b/python/lsst/daf/butler/script/queryDatasets.py
@@ -36,6 +36,7 @@
 
 from .._butler import Butler
 from ..cli.utils import sortAstropyTable
+from ..utils import has_globs
 
 if TYPE_CHECKING:
     from lsst.daf.butler import DatasetRef
@@ -261,7 +262,7 @@ def getDatasets(self) -> Iterator[list[DatasetRef]]:
             summary_datasets=dataset_types,
         )
         expanded_query_collections = [c.name for c in query_collections_info]
-        if self._find_first and set(query_collections) != set(expanded_query_collections):
+        if self._find_first and has_globs(query_collections):
             raise RuntimeError("Can not use wildcards in collections when find_first=True")
         query_collections = expanded_query_collections
 
diff --git a/tests/test_cliCmdQueryDatasets.py b/tests/test_cliCmdQueryDatasets.py
index 4e7f41e8a2..44ded271d7 100644
--- a/tests/test_cliCmdQueryDatasets.py
+++ b/tests/test_cliCmdQueryDatasets.py
@@ -32,7 +32,7 @@
 import unittest
 
 from astropy.table import Table as AstropyTable
-from lsst.daf.butler import StorageClassFactory, script
+from lsst.daf.butler import CollectionType, StorageClassFactory, script
 from lsst.daf.butler.tests import addDatasetType
 from lsst.daf.butler.tests.utils import ButlerTestHelper, MetricTestRepo, makeTestTempDir, removeTestTempDir
 from lsst.resources import ResourcePath
@@ -370,8 +370,13 @@ def testFindFirstAndCollections(self):
         # Add a new run, and add a dataset to shadow an existing dataset.
         testRepo.addDataset(run="foo", dataId={"instrument": "DummyCamComp", "visit": 424})
 
+        # Add a CHAINED collection to include the two runs, to check that
+        # flattening works as well.
+        testRepo.butler.collections.register("chain", CollectionType.CHAINED)
+        testRepo.butler.collections.redefine_chain("chain", ["foo", "ingest/run"])
+
         # Verify that without find-first, duplicate datasets are returned
-        tables = self._queryDatasets(repo=testRepo.butler, collections=["foo", "ingest/run"], show_uri=True)
+        tables = self._queryDatasets(repo=testRepo.butler, collections=["chain"], show_uri=True)
 
         # The test should be running with a single FileDatastore.
         roots = testRepo.butler.get_datastore_roots()
@@ -514,7 +519,7 @@ def testFindFirstAndCollections(self):
         # Verify that with find first the duplicate dataset is eliminated and
         # the more recent dataset is returned.
         tables = self._queryDatasets(
-            repo=testRepo.butler, collections=["foo", "ingest/run"], show_uri=True, find_first=True
+            repo=testRepo.butler, collections=["chain"], show_uri=True, find_first=True
         )
 
         expectedTables = (
@@ -614,6 +619,10 @@ def testFindFirstAndCollections(self):
 
         self.assertAstropyTablesEqual(tables, expectedTables, filterColumns=True)
 
+        # Verify that globs are not supported with find_first=True.
+        with self.assertRaises(RuntimeError):
+            self._queryDatasets(repo=testRepo.butler, collections=["*"], show_uri=True, find_first=True)
+
 
 if __name__ == "__main__":
     unittest.main()
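
Note on the new check: has_globs (imported from ..utils above) is applied to the
raw collection expressions the caller supplied, so a plain CHAINED collection
name such as "chain" no longer counts as a wildcard, while an explicit glob such
as "*" still triggers the RuntimeError. The sketch below illustrates that kind
of check with a standalone helper; looks_like_glob and the exact wildcard
character set are illustrative assumptions, not the actual daf_butler
implementation.

import re

# Standalone sketch (not the daf_butler implementation): treat an expression
# as a glob if it contains shell-style wildcard characters.
_GLOB_CHARS = re.compile(r"[*?\[\]]")


def looks_like_glob(expressions: list[str]) -> bool:
    """Return True if any collection expression contains glob characters."""
    return any(_GLOB_CHARS.search(expr) is not None for expr in expressions)


# A CHAINED collection name is not a glob, so find_first=True is allowed;
# an explicit wildcard such as "*" is rejected, as the new test asserts.
assert not looks_like_glob(["chain", "ingest/run"])
assert looks_like_glob(["*"])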