Skip to content

Commit

Permalink
Merge pull request #1079 from lsst/tickets/DM-46340
Browse files Browse the repository at this point in the history
DM-46340: Fix query datasets failing when run collection specified
  • Loading branch information
dhirving authored Sep 16, 2024
2 parents da89c37 + 1582b9e commit 4a59ea5
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 2 deletions.
1 change: 1 addition & 0 deletions doc/changes/DM-46340.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix an issue where `query_datasets` would sometimes fail when searching in a single run collection.
16 changes: 14 additions & 2 deletions python/lsst/daf/butler/registry/datasets/byDimensions/_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,13 @@ def _finish_query_builder(
only_collection_record = collections[0]
sql_projection.joiner.where(collection_col == only_collection_record.key)
if "collection" in fields:
fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name)
fields_provided["collection"] = sqlalchemy.literal(only_collection_record.name).cast(
# This cast is necessary to ensure that Postgres knows the
# type of this column if it is used in an aggregate
# function.
sqlalchemy.String
)

elif not collections:
sql_projection.joiner.where(sqlalchemy.literal(False))
if "collection" in fields:
Expand Down Expand Up @@ -710,7 +716,13 @@ def _finish_query_builder(
# know that if we find the dataset in that collection,
# then that's the datasets's run; we don't need to
# query for it.
fields_provided["run"] = sqlalchemy.literal(only_collection_record.name)
#
fields_provided["run"] = sqlalchemy.literal(only_collection_record.name).cast(
# This cast is necessary to ensure that Postgres knows the
# type of this column if it is used in an aggregate
# function.
sqlalchemy.String
)
elif run_collections_only:
# Once again we can avoid joining to the collection table by
# adding a CASE statement.
Expand Down
40 changes: 40 additions & 0 deletions python/lsst/daf/butler/tests/butler_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -1819,6 +1819,46 @@ def test_collection_query_info(self) -> None:
assert dataset_types is not None
self.assertCountEqual(dataset_types, ["flat"])

def test_dataset_queries(self) -> None:
    """Regression tests for DM-46340: dataset queries against a single
    run collection previously generated invalid SQL when region
    post-processing was involved.
    """
    butler = self.make_butler("base.yaml", "spatial.yaml")

    # Register a dataset type with visit+detector dimensions so that
    # spatial (region) post-processing — and hence the aggregate-value
    # code path — is exercised by the queries below.
    dataset_type = DatasetType("dt", ["visit", "detector"], "int", universe=butler.dimensions)
    butler.registry.registerDatasetType(dataset_type)
    butler.collections.register("run1")
    data_id = {"instrument": "Cam1", "visit": 1, "detector": 1}
    butler.registry.insertDatasets("dt", [data_id], "run1")

    spatial_where = "instrument='Cam1' and skymap='SkyMap1' and visit=1 and tract=0"

    # DM-46340 regression: a single run collection combined with
    # region-postprocessing used to produce invalid SQL because the
    # "run" dataset field carried no type information.
    found = butler.query_datasets(
        "dt",
        "run1",
        where=spatial_where,
        with_dimension_records=True,
    )
    self.assertEqual(found[0].dataId, {"instrument": "Cam1", "visit": 1, "detector": 1})
    self.assertEqual(found[0].run, "run1")

    # The 'collection' dataset field suffered from a similar missing-type
    # issue; exercise it via the general-result query interface.
    with butler.query() as q:
        search = q.join_dataset_search("dt", "run1").where(spatial_where)
        general_rows = list(
            search.general(
                dimensions=["visit", "detector"],
                dataset_fields={"dt": {"collection"}},
                find_first=True,
            )
        )
    self.assertEqual(len(general_rows), 1)
    self.assertEqual(general_rows[0]["visit"], 1)
    self.assertEqual(general_rows[0]["dt.collection"], "run1")


def _get_exposure_ids_from_dimension_records(dimension_records: Iterable[DimensionRecord]) -> list[int]:
output = []
Expand Down

0 comments on commit 4a59ea5

Please sign in to comment.