From b9c8216c5c99f451df6438ba0af29492e81c3fcb Mon Sep 17 00:00:00 2001
From: Jim Bosch
Date: Fri, 13 Sep 2024 16:56:01 -0400
Subject: [PATCH] Fix collection glob detection in query-datasets.

CHAINED collections could masquerade as globs with the previous logic.
With the new logic, we're probably trying to turn each string into a
regex more than one time, but that shouldn't be very expensive.
---
 doc/changes/DM-46339.bugfix.md                 |  1 +
 python/lsst/daf/butler/script/queryDatasets.py |  3 ++-
 tests/test_cliCmdQueryDatasets.py              | 15 ++++++++++++---
 3 files changed, 15 insertions(+), 4 deletions(-)
 create mode 100644 doc/changes/DM-46339.bugfix.md

diff --git a/doc/changes/DM-46339.bugfix.md b/doc/changes/DM-46339.bugfix.md
new file mode 100644
index 0000000000..ac6bc9184a
--- /dev/null
+++ b/doc/changes/DM-46339.bugfix.md
@@ -0,0 +1 @@
+Fix a bug in `butler query-datasets` that incorrectly rejected a find-first query against a chain collection as having a glob.
diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py
index 15edb3c365..36058d4852 100644
--- a/python/lsst/daf/butler/script/queryDatasets.py
+++ b/python/lsst/daf/butler/script/queryDatasets.py
@@ -36,6 +36,7 @@
 
 from .._butler import Butler
 from ..cli.utils import sortAstropyTable
+from ..utils import has_globs
 
 if TYPE_CHECKING:
     from lsst.daf.butler import DatasetRef
@@ -261,7 +262,7 @@ def getDatasets(self) -> Iterator[list[DatasetRef]]:
             summary_datasets=dataset_types,
         )
         expanded_query_collections = [c.name for c in query_collections_info]
-        if self._find_first and set(query_collections) != set(expanded_query_collections):
+        if self._find_first and has_globs(query_collections):
             raise RuntimeError("Can not use wildcards in collections when find_first=True")
         query_collections = expanded_query_collections
 
diff --git a/tests/test_cliCmdQueryDatasets.py b/tests/test_cliCmdQueryDatasets.py
index 4e7f41e8a2..44ded271d7 100644
--- a/tests/test_cliCmdQueryDatasets.py
+++ b/tests/test_cliCmdQueryDatasets.py
@@ -32,7 +32,7 @@
 import unittest
 
 from astropy.table import Table as AstropyTable
-from lsst.daf.butler import StorageClassFactory, script
+from lsst.daf.butler import CollectionType, StorageClassFactory, script
 from lsst.daf.butler.tests import addDatasetType
 from lsst.daf.butler.tests.utils import ButlerTestHelper, MetricTestRepo, makeTestTempDir, removeTestTempDir
 from lsst.resources import ResourcePath
@@ -370,8 +370,13 @@ def testFindFirstAndCollections(self):
         # Add a new run, and add a dataset to shadow an existing dataset.
         testRepo.addDataset(run="foo", dataId={"instrument": "DummyCamComp", "visit": 424})
 
+        # Add a CHAINED collection to include the two runs, to check that
+        # flattening works as well.
+        testRepo.butler.collections.register("chain", CollectionType.CHAINED)
+        testRepo.butler.collections.redefine_chain("chain", ["foo", "ingest/run"])
+
         # Verify that without find-first, duplicate datasets are returned
-        tables = self._queryDatasets(repo=testRepo.butler, collections=["foo", "ingest/run"], show_uri=True)
+        tables = self._queryDatasets(repo=testRepo.butler, collections=["chain"], show_uri=True)
 
         # The test should be running with a single FileDatastore.
         roots = testRepo.butler.get_datastore_roots()
@@ -514,7 +519,7 @@ def testFindFirstAndCollections(self):
         # Verify that with find first the duplicate dataset is eliminated and
         # the more recent dataset is returned.
         tables = self._queryDatasets(
-            repo=testRepo.butler, collections=["foo", "ingest/run"], show_uri=True, find_first=True
+            repo=testRepo.butler, collections=["chain"], show_uri=True, find_first=True
         )
 
         expectedTables = (
@@ -614,6 +619,10 @@ def testFindFirstAndCollections(self):
 
         self.assertAstropyTablesEqual(tables, expectedTables, filterColumns=True)
 
+        # Verify that globs are not supported with find_first=True.
+        with self.assertRaises(RuntimeError):
+            self._queryDatasets(repo=testRepo.butler, collections=["*"], show_uri=True, find_first=True)
+
 
 if __name__ == "__main__":
     unittest.main()
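
Note on the new check: has_globs (imported from ..utils above) is applied to the
raw collection expressions the caller supplied, so a plain CHAINED collection
name such as "chain" no longer counts as a wildcard, while an explicit glob such
as "*" still triggers the RuntimeError. The sketch below illustrates that kind
of check with a standalone helper; looks_like_glob and the exact wildcard
character set are illustrative assumptions, not the actual daf_butler
implementation.

import re

# Standalone sketch (not the daf_butler implementation): treat an expression
# as a glob if it contains shell-style wildcard characters.
_GLOB_CHARS = re.compile(r"[*?\[\]]")


def looks_like_glob(expressions: list[str]) -> bool:
    """Return True if any collection expression contains glob characters."""
    return any(_GLOB_CHARS.search(expr) is not None for expr in expressions)


# A CHAINED collection name is not a glob, so find_first=True is allowed;
# an explicit wildcard such as "*" is rejected, as the new test asserts.
assert not looks_like_glob(["chain", "ingest/run"])
assert looks_like_glob(["*"])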