Remove order_by from query_all_datasets
The upcoming implementation of query_all_datasets will not support order_by, so remove it. This requires modifying the query-datasets CLI to fall back to the single-dataset-type query_datasets API when order_by support is needed.
dhirving committed Nov 5, 2024
1 parent 0a27338 commit 293882f
Showing 2 changed files with 48 additions and 22 deletions.
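
For context, a minimal sketch of the calling pattern this commit leaves in place: ordered queries must go through the single-dataset-type Butler.query_datasets API, since query_all_datasets no longer accepts order_by. The repository path, dataset type, collection, and where clause below are placeholders, not values from this commit.

# Sketch only: "repo", "raw", and "HSC/defaults" are hypothetical placeholders.
from lsst.daf.butler import Butler

butler = Butler("repo")

# A single dataset type still supports ordering via Butler.query_datasets.
refs = butler.query_datasets(
    "raw",
    collections=["HSC/defaults"],
    where="instrument = 'HSC'",
    limit=10,
    order_by=["visit"],  # ordering applies within one dataset type, so this remains valid
)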
11 changes: 1 addition & 10 deletions python/lsst/daf/butler/_query_all_datasets.py
@@ -28,7 +28,7 @@
from __future__ import annotations

import logging
from collections.abc import Iterable, Iterator, Mapping, Sequence
from collections.abc import Iterable, Iterator, Mapping
from typing import Any, NamedTuple

from lsst.utils.iteration import ensure_iterable
@@ -61,7 +61,6 @@ def query_all_datasets(
where: str = "",
bind: Mapping[str, Any] | None = None,
limit: int | None = None,
order_by: Sequence[str] | None = None,
**kwargs: Any,
) -> Iterator[DatasetsPage]:
"""Query for dataset refs from multiple types simultaneously.
@@ -103,11 +102,6 @@
Upper limit on the number of returned records. `None` can be used
if no limit is wanted. A limit of ``0`` means that the query will
be executed and validated but no results will be returned.
order_by : `~collections.abc.Iterable` [`str`] or `str`, optional
Names of the columns/dimensions to use for ordering returned data
IDs. Column name can be prefixed with minus (``-``) to use
descending ordering. Results are ordered only within each dataset
type, they are not globally ordered across all results.
**kwargs
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``data_id``
@@ -130,8 +124,6 @@
"""
if find_first and has_globs(collections):
raise InvalidQueryError("Can not use wildcards in collections when find_first=True")
if order_by is None:
order_by = []
if data_id is None:
data_id = {}

@@ -144,7 +136,6 @@
query.datasets(dt, filtered_collections, find_first=find_first)
.where(data_id, where, kwargs, bind=bind)
.limit(limit)
.order_by(*order_by)
)

for page in results._iter_pages():
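As a usage illustration of the revised signature above, the following hedged sketch drives query_all_datasets without order_by. The private import path mirrors the file shown here; the repository path and collection name are placeholders.

# Sketch assuming the call signature visible in the diff above.
from lsst.daf.butler import Butler
from lsst.daf.butler._query_all_datasets import query_all_datasets  # private module shown above

butler = Butler("repo")  # hypothetical repository
with butler.query() as query:
    pages = query_all_datasets(
        butler,
        query,
        collections=["my_collection"],  # no wildcards, so find_first=True is allowed
        find_first=True,
        name=["*"],  # all dataset types
        where="",
        limit=100,
    )
    # Each page pairs one dataset type with a batch of refs; consume the pages
    # inside the context manager, since the generator is tied to the open query.
    for page in pages:
        print(page.dataset_type, len(page.data))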
59 changes: 47 additions & 12 deletions python/lsst/daf/butler/script/queryDatasets.py
@@ -30,6 +30,7 @@
import warnings
from collections import defaultdict
from collections.abc import Iterable, Iterator
from contextlib import contextmanager
from itertools import groupby
from typing import TYPE_CHECKING

@@ -206,7 +207,8 @@ def __init__(
)
glob = ["*"]

if order_by and (len(glob) > 1 or has_globs(glob)):
searches_multiple_dataset_types = len(glob) > 1 or has_globs(glob)
if order_by and searches_multiple_dataset_types:
raise NotImplementedError("--order-by is only supported for queries with a single dataset type.")

# show_uri requires a datastore.
@@ -219,6 +221,7 @@
self._find_first = find_first
self._limit = limit
self._order_by = order_by
self._searches_multiple_dataset_types = searches_multiple_dataset_types

def getTables(self) -> Iterator[AstropyTable]:
"""Get the datasets as a list of astropy tables.
@@ -279,18 +282,17 @@ def getDatasets(self) -> Iterator[list[DatasetRef]]:
else:
limit = self._limit

# We want to allow --order-by, but the query backend only supports
# it when there is a single dataset type.
# So we have to select different backends for single vs multiple
# dataset type queries.
if self._searches_multiple_dataset_types:
query_func = self._query_multiple_dataset_types
else:
query_func = self._query_single_dataset_type

try:
with self.butler.query() as query:
pages = query_all_datasets(
self.butler,
query,
collections=query_collections,
find_first=self._find_first,
name=self._dataset_type_glob,
where=self._where,
limit=limit,
order_by=self._order_by,
)
with query_func(query_collections, limit) as pages:
datasets_found = 0
for dataset_type, refs in _chunk_by_dataset_type(pages):
datasets_found += len(refs)
@@ -314,6 +316,39 @@ def getDatasets(self) -> Iterator[list[DatasetRef]]:
limit - 1,
)

@contextmanager
def _query_multiple_dataset_types(
self, collections: list[str], limit: int | None
) -> Iterator[Iterator[DatasetsPage]]:
with self.butler.query() as query:
yield query_all_datasets(
self.butler,
query,
collections=collections,
find_first=self._find_first,
name=self._dataset_type_glob,
where=self._where,
limit=limit,
)

@contextmanager
def _query_single_dataset_type(
self, collections: list[str], limit: int | None
) -> Iterator[Iterator[DatasetsPage]]:
assert len(self._dataset_type_glob) == 1
dataset_type = self._dataset_type_glob[0]

refs = self.butler.query_datasets(
dataset_type,
collections=collections,
find_first=self._find_first,
where=self._where,
limit=limit,
order_by=self._order_by,
)

yield iter([DatasetsPage(dataset_type=dataset_type, data=refs)])


def _chunk_by_dataset_type(pages: Iterator[DatasetsPage]) -> Iterator[DatasetsPage]:
# For each dataset type in the results, collect all the refs into a
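The body of _chunk_by_dataset_type is collapsed in this view. A plausible reconstruction, consistent with the truncated comment above and the groupby import added at the top of the file, might look like the following; treat it as an assumption rather than the committed code, including the import path for DatasetsPage.

# Assumed reconstruction -- the committed body is collapsed in this diff view.
from collections.abc import Iterator
from itertools import groupby

from lsst.daf.butler import DatasetRef
from lsst.daf.butler._query_all_datasets import DatasetsPage  # assumed import path


def _chunk_by_dataset_type(pages: Iterator[DatasetsPage]) -> Iterator[DatasetsPage]:
    # For each dataset type in the results, collect all the refs into a
    # single page, so callers see one contiguous chunk per dataset type.
    for dataset_type, group in groupby(pages, key=lambda page: page.dataset_type):
        refs: list[DatasetRef] = []
        for page in group:
            refs.extend(page.data)
        yield DatasetsPage(dataset_type=dataset_type, data=refs)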
