Sketch out new interfaces for querying multiple dataset types.
TallJimbo committed Aug 21, 2024
1 parent 201ee96 commit 5396bb0
Showing 7 changed files with 650 additions and 10 deletions.
10 changes: 7 additions & 3 deletions python/lsst/daf/butler/__init__.py
@@ -32,12 +32,16 @@
# Some components are not auto-imported since they can have additional runtime
# dependencies.

from . import (
ddl,
logging, # most symbols are helpers only
progress, # most symbols are only used by handler implementors
time_utils,
)
from ._butler import *
from ._butler_collections import *
from ._butler_config import *
from ._butler_dataset_types import *
from ._butler_repo_index import *
from ._collection_type import CollectionType
from ._column_categorization import *
103 changes: 98 additions & 5 deletions python/lsst/daf/butler/_butler.py
@@ -41,6 +41,7 @@

from ._butler_collections import ButlerCollections
from ._butler_config import ButlerConfig, ButlerType
from ._butler_dataset_types import ButlerDatasetTypes
from ._butler_instance_options import ButlerInstanceOptions
from ._butler_repo_index import ButlerRepoIndex
from ._config import Config, ConfigSubset
@@ -840,6 +841,7 @@ def getURI(
)
return primary

# TODO: RFC deprecating this in favor of butler.dataset_types.get.
@abstractmethod
def get_dataset_type(self, name: str) -> DatasetType:
"""Get the `DatasetType`.
@@ -1448,6 +1450,16 @@ def run(self) -> str | None:
"""
raise NotImplementedError()

# TODO: make this abstract and implement in derived classes.
@property
def dataset_types(self) -> ButlerDatasetTypes:
"""Object with methods for modifying and querying dataset types
(`~lsst.daf.butler.ButlerDatasetTypes`).

Use of this object is preferred over `registry` wherever possible.
"""
raise NotImplementedError()
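
A minimal usage sketch for the new accessor; the repository path and dataset type name are illustrative, and the property raises NotImplementedError until derived classes implement it:

from lsst.daf.butler import Butler

butler = Butler.from_config("/repo/main")  # hypothetical repository path
raw = butler.dataset_types.get("raw")  # hypothetical dataset type name; preferred over registry
print(raw.name, raw.dimensions)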

@property
@abstractmethod
def registry(self) -> Registry:
@@ -1572,22 +1584,20 @@ def _query_datasets(
explain: bool = True,
**kwargs: Any,
) -> list[DatasetRef]:
"""Query for dataset references matching user-provided criteria.
"""Query for dataset references of a single dataset type.
Parameters
----------
dataset_type : `str` or `DatasetType`
Dataset type object or name to search for.
collections : collection expression, optional
A collection name or iterable of collection names to search. If not
provided, the default collections are used.
find_first : `bool`, optional
If `True` (default), for each result data ID, only yield one
`DatasetRef` of each `DatasetType`, from the first collection in
which a dataset of that dataset type appears (according to the
order of ``collections`` passed in).
data_id : `dict` or `DataCoordinate`, optional
A data ID whose key-value pairs are used as equality constraints in
the query.
@@ -1739,6 +1749,89 @@ def _query_dimension_records(
raise EmptyQueryResultError(list(result.explain_no_results()))
return dimension_records
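
For context, a sketch of how the ``explain`` behavior surfaces to callers via the public wrapper; the repository path and instrument value are illustrative, and the exception's ``reasons`` attribute is assumed from the constructor call above:

from lsst.daf.butler import Butler, EmptyQueryResultError

butler = Butler.from_config("/repo/main")  # hypothetical repository path
try:
    records = butler.query_dimension_records("detector", instrument="HSC")
except EmptyQueryResultError as err:
    # With explain=True (the default), an empty result raises instead of
    # returning [], carrying strings that describe the likely cause.
    print("\n".join(err.reasons))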

def _query_all_datasets(
self,
collections: str | Iterable[str] | None = None,
*,
name: str | Iterable[str] = "*",
at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
exact_dimensions: Iterable[str] | DimensionGroup | None = None,
storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
is_calibration: bool | None = None,
find_first: bool = True,
data_id: DataId | None = None,
where: str = "",
bind: Mapping[str, Any] | None = None,
explain: bool = True,
**kwargs: Any,
) -> list[DatasetRef]:
"""Query for datasets of potentially multiple types.
Parameters
----------
collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
The collection or collections to search, in order. If not provided
or `None`, the default collection search path for this butler is
used.
name : `str` or `~collections.abc.Iterable` [ `str` ], optional
Names or name patterns (glob-style) that returned dataset type
names must match. If an iterable, items are OR'd together. The
default is to include all dataset types in the given collections.
at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
optional
Dimensions that returned dataset types must have as a subset.
exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
optional
Dimensions that returned dataset types must have exactly.
storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\
or `StorageClass` or \
`~collections.abc.Iterable` [ `StorageClass` ], optional
Storage classes or storage class names that returned dataset types
must have. If an iterable, items are OR'd together.
is_calibration : `bool` or `None`, optional
If not `None`, constrain returned dataset types to be or not be
calibrations.
find_first : `bool`, optional
If `True` (default), for each result data ID, only yield one
`DatasetRef` of each `DatasetType`, from the first collection in
which a dataset of that dataset type appears (according to the
order of ``collections`` passed in).
data_id : `dict` or `DataCoordinate`, optional
A data ID whose key-value pairs are used as equality constraints in
the query.
where : `str`, optional
A string expression similar to a SQL WHERE clause. May involve any
column of a dimension table or (as a shortcut for the primary key
column of a dimension table) dimension name. See
:ref:`daf_butler_dimension_expressions` for more information.
bind : `~collections.abc.Mapping`, optional
Mapping containing literal values that should be injected into the
``where`` expression, keyed by the identifiers they replace. Values
of collection type can be expanded in some cases; see
:ref:`daf_butler_dimension_expressions_identifiers` for more
information.
explain : `bool`, optional
If `True` (default) then `EmptyQueryResultError` exception is
raised when resulting list is empty. The exception contains
non-empty list of strings explaining possible causes for empty
result.
**kwargs
Additional keyword arguments are forwarded to
`DataCoordinate.standardize` when processing the ``data_id``
argument (and may be used to provide a constraining data ID even
when the ``data_id`` argument is `None`).

Returns
-------
refs : `list` [ `DatasetRef` ]
Dataset references matching the given query criteria. Nested data
IDs are guaranteed to include values for all implied dimensions
(i.e. `DataCoordinate.hasFull` will return `True`), but will not
include dimension records (`DataCoordinate.hasRecords` will be
`False`).
"""
raise NotImplementedError()
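
A sketch of the intended call pattern once this is implemented; the collection, name pattern, and ``where`` expression are illustrative:

# Find datasets of every "*_config" dataset type in one collection,
# keeping only the first match per data ID (find_first=True default).
# Raises NotImplementedError until the method is implemented.
refs = butler._query_all_datasets(
    "HSC/runs/RC2",  # hypothetical collection
    name="*_config",
    where="instrument = 'HSC'",
)
for ref in refs:
    print(ref.datasetType.name, ref.dataId)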

@abstractmethod
def _clone(
self,
Expand Down
220 changes: 220 additions & 0 deletions python/lsst/daf/butler/_butler_dataset_types.py
@@ -0,0 +1,220 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ButlerDatasetTypes",)

from abc import ABC, abstractmethod
from collections.abc import Iterable, Sequence

from ._dataset_type import DatasetType
from ._storage_class import StorageClass
from .dimensions import DimensionGroup


class ButlerDatasetTypes(ABC, Sequence):
"""Methods for working with the dataset types known to the Butler."""

@abstractmethod
def get(self, name: str) -> DatasetType:
"""Return the dataset type with the given name.
Returns
-------
dataset_type : `DatasetType`
Dataset type object with the given name.
Raises
------
MissingDatasetTypeError
Raised if there is no dataset type with the given name.
"""
raise NotImplementedError()
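
A sketch of the error contract; the dataset type name is illustrative, and `MissingDatasetTypeError` is imported from `lsst.daf.butler.registry`:

from lsst.daf.butler.registry import MissingDatasetTypeError

try:
    dataset_type = butler.dataset_types.get("not_a_real_type")  # hypothetical name
except MissingDatasetTypeError:
    print("no such dataset type")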

@abstractmethod
def query(
self,
name: str | Iterable[str],
*,
at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
exact_dimensions: Iterable[str] | DimensionGroup | None = None,
storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
is_calibration: bool | None = None,
) -> Iterable[DatasetType]:
"""Query for dataset types matching the given criteria.
Parameters
----------
name : `str` or `~collections.abc.Iterable` [ `str` ]
Names or name patterns (glob-style) that returned dataset type
names must match. If an iterable, items are OR'd together.
at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
optional
Dimensions that returned dataset types must have as a subset.
exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
optional
Dimensions that returned dataset types must have exactly.
storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\
or `StorageClass` or \
`~collections.abc.Iterable` [ `StorageClass` ], optional
Storage classes or storage class names that returned dataset types
must have. If an iterable, items are OR'd together.
is_calibration : `bool` or `None`, optional
If not `None`, constrain returned dataset types to be or not be
calibrations.

Returns
-------
dataset_types : `~collections.abc.Iterable` [ `DatasetType` ]
An iterable of dataset types. This is guaranteed to be a regular
Python in-memory container, not a lazy single-pass iterator, but
the type of container is currently left unspecified in order to
leave room for future convenience behavior.

Notes
-----
This method queries all registered dataset types in the registry. To
query for the types of datasets that are in a collection, instead use::

    info = butler.collections.query_info(
        collections,
        include_summaries=True,
    )

for a simple summary of the dataset types in each collection (see
`lsst.daf.butler.ButlerCollections.query_info`). Or, for more complex
but more powerful queries (including constraints on data IDs or dataset
counts), use::

    with butler.query() as q:
        dataset_types = q.dataset_types(collections)

See `lsst.daf.butler.queries.Query.dataset_types` for details.
"""
raise NotImplementedError()
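
A sketch combining the constraints; the glob pattern and dimension names are illustrative, and items within a single iterable argument are OR'd together per the docstring:

coadd_types = butler.dataset_types.query(
    "*Coadd*",  # multiple patterns would be OR'd together
    exact_dimensions=["band", "skymap", "tract", "patch"],
    is_calibration=False,
)
for dataset_type in coadd_types:
    print(dataset_type.name)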

@abstractmethod
def query_names(
self,
name: str | Iterable[str],
*,
at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
exact_dimensions: Iterable[str] | DimensionGroup | None = None,
storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
is_calibration: bool | None = None,
) -> Iterable[str]:
"""Query for the names of dataset types matching the given criteria.
See `query` for parameter descriptions.
"""
raise NotImplementedError()
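
The same criteria, returning just the names (sketch; arguments as in the illustrative `query` call above):

names = butler.dataset_types.query_names(
    "*Coadd*",
    exact_dimensions=["band", "skymap", "tract", "patch"],
)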

@abstractmethod
def register(
self,
name_or_type: str | DatasetType,
/,
dimensions: Iterable[str] | DimensionGroup | None = None,
storage_class: str | StorageClass | None = None,
is_calibration: bool | None = None,
) -> bool:
"""Register a dataset type.
It is not an error to register the same `DatasetType` twice.
Parameters
----------
name_or_type : `str` or `DatasetType`
The name of the dataset type to be added, or a complete
`DatasetType` type object to add.
dimensions : `~collections.abc.Iterable` [ `str` ] or `DimensionGroup`,\
optional
Dimensions for the dataset type. Required if the first argument
is just a `str`, and overrides the dimensions if the first argument
is a `DatasetType`.
storage_class : `str` or `StorageClass`, optional
Storage class for the dataset type. Required if the first argument
is just a `str`, and overrides the storage class if the first
argument is a `DatasetType`.
is_calibration : `bool`, optional
Whether the dataset type is a calibration. If the first argument
is a `str`, defaults to `False`. If the first argument is a
`DatasetType` and this argument is not `None`, it overrides the
value on the `DatasetType`.

Returns
-------
inserted : `bool`
`True` if a new dataset type was inserted, `False` if an identical
existing dataset type was found. Note that in either case the
dataset type is guaranteed to be defined in the repository
consistently with the given definition.

Raises
------
ValueError
Raised if the dimensions or storage class are invalid.
lsst.daf.butler.registry.ConflictingDefinitionError
Raised if this dataset type is already registered with a different
definition.
"""
raise NotImplementedError()
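
A registration sketch under both calling conventions; the dataset type name and dimensions are illustrative, and "DataFrame" assumes a storage class from the standard configuration:

from lsst.daf.butler import DatasetType

# By name: dimensions and storage_class are then required.
inserted = butler.dataset_types.register(
    "objectTable_tract",  # hypothetical dataset type name
    dimensions=["skymap", "tract"],
    storage_class="DataFrame",
)

# Or with a complete DatasetType object; registering an identical
# definition twice is not an error, the second call returns False.
dt = DatasetType(
    "objectTable_tract",
    dimensions=["skymap", "tract"],
    storageClass="DataFrame",
    universe=butler.dimensions,
)
butler.dataset_types.register(dt)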

@abstractmethod
def remove(self, name: str) -> None:
"""Remove the dataset type with the given name.
.. warning::
Butler implementations can cache the dataset type definitions.
This means that deleting the dataset type definition may result in
unexpected behavior from other butler processes that are active
that have not seen the deletion.
Parameters
----------
name : `str` or `tuple` [`str`]
Name of the type to be removed or tuple containing a list of type
names to be removed. Wildcards are allowed.
Raises
------
lsst.daf.butler.registry.OrphanedRecordError
Raised if an attempt is made to remove the dataset type definition
when there are still datasets associated with it.
Notes
-----
If the dataset type is not registered the method will return without
action.
"""
raise NotImplementedError()
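
And the corresponding removal sketch, idempotent for unregistered names per the notes above (the name is the hypothetical one from the register example):

# Raises OrphanedRecordError if datasets of this type still exist;
# returns without action if the name was never registered.
butler.dataset_types.remove("objectTable_tract")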
2 changes: 2 additions & 0 deletions python/lsst/daf/butler/queries/__init__.py
@@ -28,6 +28,8 @@
from ._base import *
from ._data_coordinate_query_results import *
from ._dataset_query_results import *
from ._dataset_type_results import *
from ._dimension_record_query_results import *
from ._general_query_results import *
from ._heterogeneous_dataset_results import *
from ._query import *
