From 6a4a62b776992faff5e2b5f35da12e53f6fb4ad9 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 11 Oct 2024 10:08:07 -0700 Subject: [PATCH 01/39] Add Butler.retrieve_artifacts_zip() --- python/lsst/daf/butler/_butler.py | 25 ++++ python/lsst/daf/butler/_quantum_backed.py | 27 +++- .../lsst/daf/butler/datastore/_datastore.py | 15 ++- .../daf/butler/datastores/chainedDatastore.py | 32 +++-- .../daf/butler/datastores/fileDatastore.py | 24 +++- .../file_datastore/retrieve_artifacts.py | 116 +++++++++++++++++- .../butler/datastores/inMemoryDatastore.py | 3 +- .../butler/direct_butler/_direct_butler.py | 11 +- .../butler/remote_butler/_remote_butler.py | 31 ++++- python/lsst/daf/butler/tests/hybrid_butler.py | 7 ++ 10 files changed, 265 insertions(+), 26 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index c06935bea0..9593c13f1b 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -987,6 +987,31 @@ def find_dataset( """ raise NotImplementedError() + @abstractmethod + def retrieve_artifacts_zip( + self, + refs: Iterable[DatasetRef], + destination: ResourcePathExpression, + ) -> ResourcePath: + """Retrieve artifacts from a Butler and place in ZIP file. + + Parameters + ---------- + refs : `collections.abc.Iterable` [ `DatasetRef` ] + The datasets to be included in the zip file. Must all be from + the same dataset type. + destination : `lsst.resources.ResourcePathExpression` + Directory to write the new ZIP file. This directory will + also be used as a staging area for the datasets being downloaded + from the datastore. + + Returns + ------- + zip_file : `lsst.resources.ResourcePath` + The path to the new ZIP file. + """ + raise NotImplementedError() + @abstractmethod def retrieveArtifacts( self, diff --git a/python/lsst/daf/butler/_quantum_backed.py b/python/lsst/daf/butler/_quantum_backed.py index b1594efcc8..9176b752d1 100644 --- a/python/lsst/daf/butler/_quantum_backed.py +++ b/python/lsst/daf/butler/_quantum_backed.py @@ -39,7 +39,7 @@ from typing import TYPE_CHECKING, Any import pydantic -from lsst.resources import ResourcePathExpression +from lsst.resources import ResourcePath, ResourcePathExpression from ._butler_config import ButlerConfig from ._config import Config @@ -51,6 +51,7 @@ from ._storage_class import StorageClass, StorageClassFactory from .datastore import Datastore from .datastore.record_data import DatastoreRecordData, SerializedDatastoreRecordData +from .datastores.file_datastore.retrieve_artifacts import retrieve_and_zip from .dimensions import DimensionUniverse from .registry.bridge.monolithic import MonolithicDatastoreRegistryBridgeManager from .registry.databases.sqlite import SqliteDatabase @@ -498,6 +499,30 @@ def pruneDatasets( # Point of no return for removing artifacts self._datastore.emptyTrash() + def retrieve_artifacts_zip( + self, + refs: Iterable[DatasetRef], + destination: ResourcePathExpression, + ) -> ResourcePath: + """Retrieve artifacts from the graph and place in ZIP file. + + Parameters + ---------- + refs : `collections.abc.Iterable` [ `DatasetRef` ] + The datasets to be included in the zip file. Must all be from + the same dataset type. + destination : `lsst.resources.ResourcePathExpression` + Directory to write the new ZIP file. This directory will + also be used as a staging area for the datasets being downloaded + from the datastore. + + Returns + ------- + zip_file : `lsst.resources.ResourcePath` + The path to the new ZIP file. 
+ """ + return retrieve_and_zip(refs, destination, self._datastore.retrieveArtifacts) + def extract_provenance_data(self) -> QuantumProvenanceData: """Extract provenance information and datastore records from this butler. diff --git a/python/lsst/daf/butler/datastore/_datastore.py b/python/lsst/daf/butler/datastore/_datastore.py index 8514aacc24..7e71c81e05 100644 --- a/python/lsst/daf/butler/datastore/_datastore.py +++ b/python/lsst/daf/butler/datastore/_datastore.py @@ -55,13 +55,14 @@ from .._file_dataset import FileDataset from .._storage_class import StorageClassFactory from .constraints import Constraints +from .stored_file_info import StoredFileInfo if TYPE_CHECKING: from lsst.resources import ResourcePath, ResourcePathExpression from .. import ddl from .._config_support import LookupKey - from .._dataset_ref import DatasetRef + from .._dataset_ref import DatasetId, DatasetRef from .._dataset_type import DatasetType from .._storage_class import StorageClass from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager @@ -1023,7 +1024,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, - ) -> list[ResourcePath]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the artifacts associated with the supplied refs. Parameters @@ -1051,12 +1052,18 @@ def retrieveArtifacts( targets : `list` of `lsst.resources.ResourcePath` URIs of file artifacts in destination location. Order is not preserved. + artifacts_to_ref_id : `dict` [ `~lsst.resources.ResourcePath`, \ + `list` [ `uuid.UUID` ] ] + Mapping of retrieved artifact path to DatasetRef ID. + artifacts_to_info : `dict` [ `~lsst.resources.ResourcePath`, \ + `StoredDatastoreItemInfo` ] + Mapping of retrieved artifact path to datastore record information. Notes ----- For non-file datastores the artifacts written to the destination may not match the representation inside the datastore. For example - a hierarchichal data structure in a NoSQL database may well be stored + a hierarchical data structure in a NoSQL database may well be stored as a JSON file. 
""" raise NotImplementedError() @@ -1458,7 +1465,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, - ) -> list[ResourcePath]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: raise NotImplementedError("This is a no-op datastore that can not access a real datastore") def remove(self, datasetRef: DatasetRef) -> None: diff --git a/python/lsst/daf/butler/datastores/chainedDatastore.py b/python/lsst/daf/butler/datastores/chainedDatastore.py index 09432a2c21..02ff472d5c 100644 --- a/python/lsst/daf/butler/datastores/chainedDatastore.py +++ b/python/lsst/daf/butler/datastores/chainedDatastore.py @@ -38,7 +38,7 @@ from collections.abc import Callable, Collection, Iterable, Mapping, Sequence from typing import TYPE_CHECKING, Any -from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, FileDataset +from lsst.daf.butler import DatasetId, DatasetRef, DatasetTypeNotSupportedError, FileDataset from lsst.daf.butler.datastore import ( DatasetRefURIs, Datastore, @@ -48,6 +48,7 @@ ) from lsst.daf.butler.datastore.constraints import Constraints from lsst.daf.butler.datastore.record_data import DatastoreRecordData +from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo from lsst.resources import ResourcePath from lsst.utils import doImportType @@ -804,7 +805,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, - ) -> list[ResourcePath]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the file artifacts associated with the supplied refs. Parameters @@ -832,6 +833,12 @@ def retrieveArtifacts( targets : `list` of `lsst.resources.ResourcePath` URIs of file artifacts in destination location. Order is not preserved. + artifacts_to_ref_id : `dict` [ `~lsst.resources.ResourcePath`, \ + `list` [ `uuid.UUID` ] ] + Mapping of retrieved artifact path to `DatasetRef` ID. + artifacts_to_info : `dict` [ `~lsst.resources.ResourcePath`, \ + `StoredDatastoreItemInfo` ] + Mapping of retrieved artifact path to datastore record information. """ if not destination.isdir(): raise ValueError(f"Destination location must refer to a directory. Given {destination}") @@ -872,18 +879,21 @@ def retrieveArtifacts( # Now do the transfer. targets: list[ResourcePath] = [] + merged_artifacts_to_ref_id: dict[ResourcePath, list[DatasetId]] = {} + merged_artifacts_to_info: dict[ResourcePath, StoredFileInfo] = {} for number, datastore_refs in grouped_by_datastore.items(): - targets.extend( - self.datastores[number].retrieveArtifacts( - datastore_refs, - destination, - transfer=transfer, - preserve_path=preserve_path, - overwrite=overwrite, - ) + retrieved, artifacts_to_ref_id, artifacts_to_info = self.datastores[number].retrieveArtifacts( + datastore_refs, + destination, + transfer=transfer, + preserve_path=preserve_path, + overwrite=overwrite, ) + targets.extend(retrieved) + merged_artifacts_to_ref_id.update(artifacts_to_ref_id) + merged_artifacts_to_info.update(artifacts_to_info) - return targets + return targets, merged_artifacts_to_ref_id, merged_artifacts_to_info def remove(self, ref: DatasetRef) -> None: """Indicate to the datastore that a dataset can be removed. 
diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index a4d3000e07..2b0de81642 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -1953,7 +1953,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, - ) -> list[ResourcePath]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the file artifacts associated with the supplied refs. Parameters @@ -1981,6 +1981,12 @@ def retrieveArtifacts( targets : `list` of `lsst.resources.ResourcePath` URIs of file artifacts in destination location. Order is not preserved. + artifacts_to_ref_id : `dict` [ `~lsst.resources.ResourcePath`, \ + `list` [ `uuid.UUID` ] ] + Mapping of retrieved artifact path to `DatasetRef` ID. + artifacts_to_info : `dict` [ `~lsst.resources.ResourcePath`, \ + `StoredDatastoreItemInfo` ] + Mapping of retrieved artifact path to datastore record information. """ if not destination.isdir(): raise ValueError(f"Destination location must refer to a directory. Given {destination}") @@ -1993,21 +1999,30 @@ def retrieveArtifacts( # that will map to the same underlying file transfer. to_transfer: dict[ResourcePath, ResourcePath] = {} + # Retrieve all the records in bulk indexed by ref.id. + records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) + + # One artifact can be used by multiple DatasetRef. + # e.g. DECam. + artifact_to_ref_id: dict[ResourcePath, list[DatasetId]] = defaultdict(list) + artifact_to_info: dict[ResourcePath, StoredFileInfo] = {} for ref in refs: - locations = self._get_dataset_locations_info(ref) - for location, _ in locations: + for info in records[ref.id]: + location = info.file_location(self.locationFactory) source_uri = location.uri target_uri = determine_destination_for_retrieved_artifact( destination, location.pathInStore, preserve_path ) to_transfer[source_uri] = target_uri + artifact_to_ref_id[target_uri].append(ref.id) + artifact_to_info[target_uri] = info # In theory can now parallelize the transfer log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) for source_uri, target_uri in to_transfer.items(): target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) - return list(to_transfer.values()) + return list(to_transfer.values()), artifact_to_ref_id, artifact_to_info def get( self, @@ -2834,6 +2849,7 @@ def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, Datastore dataset_records.setdefault(self._table.name, []).append(info) record_data = DatastoreRecordData(records=records) + print("XXXCX: ", self.name, record_data) return {self.name: record_data} def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 0445a1bd30..ab6cccb45d 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -27,9 +27,35 @@ from __future__ import annotations -__all__ = ("determine_destination_for_retrieved_artifact",) +__all__ = ("determine_destination_for_retrieved_artifact", "retrieve_and_zip", "ZipIndex") +import tempfile +import uuid +import zipfile +from 
collections.abc import Callable, Iterable + +from lsst.daf.butler import DatasetId, DatasetIdFactory, DatasetRef, SerializedDatasetRef +from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo, StoredFileInfo from lsst.resources import ResourcePath, ResourcePathExpression +from pydantic import BaseModel + + +class ZipIndex(BaseModel): + """Index of a Zip file of Butler datasets. + + A file can be associated with multiple butler datasets and a single + butler dataset can be associated with multiple files. This model + provides the necessary information for ingesting back into a Butler + file datastore. + """ + + refs: list[SerializedDatasetRef] + """The Butler datasets stored in the Zip file.""" + # Can have multiple refs associated with a single file. + ref_map: dict[str, list[uuid.UUID]] + """Mapping of Zip member to one or more dataset UUIDs.""" + info_map: dict[str, SerializedStoredFileInfo] + """Mapping of each Zip member to the associated datastore record.""" def determine_destination_for_retrieved_artifact( @@ -70,3 +96,91 @@ def determine_destination_for_retrieved_artifact( if target_uri.relative_to(destination_directory) is None: raise ValueError(f"File path attempts to escape destination directory: '{source_path}'") return target_uri + + +def retrieve_and_zip( + refs: Iterable[DatasetRef], + destination: ResourcePathExpression, + retrieval_callback: Callable[ + [Iterable[DatasetRef], ResourcePath, str, bool, bool], + tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]], + ], +) -> ResourcePath: + """Retrieve artifacts from a Butler and place in ZIP file. + + Parameters + ---------- + refs : `collections.abc.Iterable` [ `DatasetRef` ] + The datasets to be included in the zip file. Must all be from + the same dataset type. + destination : `lsst.resources.ResourcePath` + Directory to write the new ZIP file. This directory will + also be used as a staging area for the datasets being downloaded + from the datastore. + retrieval_callback : `~collections.abc.Callable` + Bound method for a function that can retrieve the artifacts and + return the metadata necessary for creating the zip index. For example + `lsst.daf.butler.datastore.Datastore.retrieveArtifacts`. + + Returns + ------- + zip_file : `lsst.resources.ResourcePath` + The path to the new ZIP file. + """ + outdir = ResourcePath(destination, forceDirectory=True) + if not outdir.isdir(): + raise ValueError(f"Destination location must refer to a directory. Given {destination}") + # Simplest approach: + # - create temp dir in destination + # - Run retrieveArtifacts to that temp dir + # - Create zip file with unique name. + # - Delete temp dir + # - Add index file to ZIP. + # - Return name of zip file. + with tempfile.TemporaryDirectory(dir=outdir.ospath, ignore_cleanup_errors=True) as tmpdir: + tmpdir_path = ResourcePath(tmpdir, forceDirectory=True) + paths, refmap, infomap = retrieval_callback(refs, tmpdir_path, "copy", True, False) + # Will store the relative paths in the zip file. + # These relative paths cannot be None but mypy doesn't know this. + file_to_relative: dict[ResourcePath, str] = {} + for p in paths: + rel = p.relative_to(tmpdir_path) + assert rel is not None + file_to_relative[p] = rel + + # Name of zip file. Options are: + # - uuid that changes every time. + # - uuid5 based on file paths in zip + # - uuid5 based on ref uuids. + # - checksum derived from the above. + # Start with uuid5 from file paths. 
+ data = ",".join(file_to_relative.values()) + # No need to come up with a different namespace. + zip_uuid = uuid.uuid5(DatasetIdFactory.NS_UUID, data) + zip_file_name = f"{zip_uuid}.zip" + + # Index maps relative path to DatasetRef. + # The index has to: + # - list all the DatasetRef + # - map each artifact's path to a ref + # - include the datastore records to allow formatter to be + # extracted for that path and component be associated. + # Simplest not to try to include the full set of StoredItemInfo. + + # Convert the mappings to simplified form for pydantic. + # and use the relative paths that will match the zip. + simplified_refs = [ref.to_simple() for ref in refs] + simplified_ref_map = {} + for path, ids in refmap.items(): + simplified_ref_map[file_to_relative[path]] = ids + simplified_info_map = {file_to_relative[path]: info.to_simple() for path, info in infomap.items()} + + index = ZipIndex(refs=simplified_refs, ref_map=simplified_ref_map, info_map=simplified_info_map) + + zip_path = outdir.join(zip_file_name, forceDirectory=False) + with zipfile.ZipFile(zip_path.ospath, "w") as zip: + zip.writestr("_index.json", index.model_dump_json(exclude_defaults=True, exclude_unset=True)) + for path, name in file_to_relative.items(): + zip.write(path.ospath, name) + + return zip_path diff --git a/python/lsst/daf/butler/datastores/inMemoryDatastore.py b/python/lsst/daf/butler/datastores/inMemoryDatastore.py index 57bba73335..31cde89131 100644 --- a/python/lsst/daf/butler/datastores/inMemoryDatastore.py +++ b/python/lsst/daf/butler/datastores/inMemoryDatastore.py @@ -50,6 +50,7 @@ if TYPE_CHECKING: from lsst.daf.butler import Config, DatasetType, LookupKey from lsst.daf.butler.datastore import DatastoreOpaqueTable + from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager log = logging.getLogger(__name__) @@ -523,7 +524,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool | None = False, - ) -> list[ResourcePath]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the file artifacts associated with the supplied refs. Parameters diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index 7c89d77d2f..5eacd9eed3 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -67,6 +67,7 @@ from .._storage_class import StorageClass, StorageClassFactory from .._timespan import Timespan from ..datastore import Datastore, NullDatastore +from ..datastores.file_datastore.retrieve_artifacts import retrieve_and_zip from ..dimensions import DataCoordinate, Dimension from ..direct_query_driver import DirectQueryDriver from ..progress import Progress @@ -1290,6 +1291,13 @@ def find_dataset( return ref + def retrieve_artifacts_zip( + self, + refs: Iterable[DatasetRef], + destination: ResourcePathExpression, + ) -> ResourcePath: + return retrieve_and_zip(refs, destination, self._datastore.retrieveArtifacts) + def retrieveArtifacts( self, refs: Iterable[DatasetRef], @@ -1299,13 +1307,14 @@ def retrieveArtifacts( overwrite: bool = False, ) -> list[ResourcePath]: # Docstring inherited. 
- return self._datastore.retrieveArtifacts( + paths, _, _ = self._datastore.retrieveArtifacts( refs, ResourcePath(destination), transfer=transfer, preserve_path=preserve_path, overwrite=overwrite, ) + return paths def exists( self, diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index bda66fc20a..108c4744f9 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -30,6 +30,7 @@ __all__ = ("RemoteButler",) import uuid +from collections import defaultdict from collections.abc import Collection, Iterable, Iterator, Sequence from contextlib import AbstractContextManager, contextmanager from dataclasses import dataclass @@ -39,6 +40,7 @@ from deprecated.sphinx import deprecated from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ( determine_destination_for_retrieved_artifact, + retrieve_and_zip, ) from lsst.daf.butler.datastores.fileDatastoreClient import ( FileDatastoreGetPayload, @@ -57,6 +59,7 @@ from .._utilities.locked_object import LockedObject from ..datastore import DatasetRefURIs, DatastoreConfig from ..datastore.cache_manager import AbstractDatastoreCacheManager, DatastoreCacheManager +from ..datastore.stored_file_info import StoredFileInfo from ..dimensions import DataIdValue, DimensionConfig, DimensionUniverse, SerializedDataId from ..queries import Query from ..registry import CollectionArgType, NoDefaultCollectionError, Registry, RegistryDefaults @@ -414,14 +417,14 @@ def find_dataset( ref = DatasetRef.from_simple(model.dataset_ref, universe=self.dimensions) return apply_storage_class_override(ref, dataset_type, storage_class) - def retrieveArtifacts( + def _retrieve_artifacts( self, refs: Iterable[DatasetRef], destination: ResourcePathExpression, transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, - ) -> list[ResourcePath]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: destination = ResourcePath(destination).abspath() if not destination.isdir(): raise ValueError(f"Destination location must refer to a directory. Given {destination}.") @@ -429,6 +432,8 @@ def retrieveArtifacts( if transfer not in ("auto", "copy"): raise ValueError("Only 'copy' and 'auto' transfer modes are supported.") + artifact_to_ref_id: dict[ResourcePath, list[DatasetId]] = defaultdict(list) + artifact_to_info: dict[ResourcePath, StoredFileInfo] = {} output_uris: list[ResourcePath] = [] for ref in refs: file_info = _to_file_payload(self._get_file_info_for_ref(ref)).file_info @@ -442,8 +447,28 @@ def retrieveArtifacts( # after retrieving the URL. 
target_uri.transfer_from(source_uri, transfer="copy", overwrite=overwrite) output_uris.append(target_uri) + artifact_to_ref_id[target_uri].append(ref.id) + artifact_to_info[target_uri] = StoredFileInfo.from_simple(file.datastoreRecords) + + return output_uris, artifact_to_ref_id, artifact_to_info + + def retrieve_artifacts_zip( + self, + refs: Iterable[DatasetRef], + destination: ResourcePathExpression, + ) -> ResourcePath: + return retrieve_and_zip(refs, destination, self._retrieve_artifacts) - return output_uris + def retrieveArtifacts( + self, + refs: Iterable[DatasetRef], + destination: ResourcePathExpression, + transfer: str = "auto", + preserve_path: bool = True, + overwrite: bool = False, + ) -> list[ResourcePath]: + paths, _, _ = self._retrieve_artifacts(refs, destination, transfer, preserve_path, overwrite) + return paths def exists( self, diff --git a/python/lsst/daf/butler/tests/hybrid_butler.py b/python/lsst/daf/butler/tests/hybrid_butler.py index ebe8824015..030f3d5ffc 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler.py +++ b/python/lsst/daf/butler/tests/hybrid_butler.py @@ -206,6 +206,13 @@ def find_dataset( **kwargs, ) + def retrieve_artifacts_zip( + self, + refs: Iterable[DatasetRef], + destination: ResourcePathExpression, + ) -> ResourcePath: + return self._remote_butler.retrieve_artifacts_zip(refs, destination) + def retrieveArtifacts( self, refs: Iterable[DatasetRef], From 96d86d4f92aaab3dce822234f466cfea1e91cafc Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Mon, 14 Oct 2024 15:08:05 -0700 Subject: [PATCH 02/39] Trust that the datastore knows an artifact exists without checking Fpr chained datastore retrieveArtifacts was checking each dataset for file existence in a loop which can be really slow on GCS. Instead use knows_these and assume the datastore does have the file if it has the ref. --- python/lsst/daf/butler/datastores/chainedDatastore.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/datastores/chainedDatastore.py b/python/lsst/daf/butler/datastores/chainedDatastore.py index 02ff472d5c..1a87d66061 100644 --- a/python/lsst/daf/butler/datastores/chainedDatastore.py +++ b/python/lsst/daf/butler/datastores/chainedDatastore.py @@ -862,7 +862,12 @@ def retrieveArtifacts( # cache is exactly what we should be doing. continue try: - datastore_refs = {ref for ref in pending if datastore.exists(ref)} + # Checking file existence is expensive. Have the option + # of checking whether the datastore knows of these datasets + # instead, which is fast but can potentially lead to + # retrieveArtifacts failing. + knows = datastore.knows_these(pending) + datastore_refs = {ref for ref, exists in knows.items() if exists} except NotImplementedError: # Some datastores may not support retrieving artifacts continue From 2489f4176cefbcc936e67532b9bafaeeee56ab85 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 15 Oct 2024 14:25:40 -0700 Subject: [PATCH 03/39] Write index file with retrieveArtifacts and use it for Zip Refactor some of the zip index code as part of this. 
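
For reference, a short sketch of how the _index.json written by retrieveArtifacts() can be read back with the ZipIndex model defined in this commit. The destination directory is hypothetical; the index file name comes from ZipIndex.index_name.

    from lsst.resources import ResourcePath
    from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex

    # Hypothetical directory previously passed to retrieveArtifacts().
    destination = ResourcePath("/tmp/retrieved/", forceDirectory=True)
    index_path = destination.join(ZipIndex.index_name, forceDirectory=False)
    index = ZipIndex.model_validate_json(index_path.read())
    for zip_member, dataset_ids in index.ref_map.items():
        print(zip_member, "->", dataset_ids)
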
--- .../lsst/daf/butler/datastore/_datastore.py | 6 + .../daf/butler/datastores/chainedDatastore.py | 13 ++ .../daf/butler/datastores/fileDatastore.py | 10 ++ .../file_datastore/retrieve_artifacts.py | 159 +++++++++++++----- .../butler/datastores/inMemoryDatastore.py | 5 + .../butler/direct_butler/_direct_butler.py | 14 +- .../butler/remote_butler/_remote_butler.py | 14 +- tests/test_butler.py | 2 + tests/test_cliCmdRetrieveArtifacts.py | 4 +- 9 files changed, 182 insertions(+), 45 deletions(-) diff --git a/python/lsst/daf/butler/datastore/_datastore.py b/python/lsst/daf/butler/datastore/_datastore.py index 7e71c81e05..289aee2df8 100644 --- a/python/lsst/daf/butler/datastore/_datastore.py +++ b/python/lsst/daf/butler/datastore/_datastore.py @@ -1024,6 +1024,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, + write_index: bool = True, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the artifacts associated with the supplied refs. @@ -1046,6 +1047,10 @@ def retrieveArtifacts( overwrite : `bool`, optional If `True` allow transfers to overwrite existing files at the destination. + write_index : `bool`, optional + If `True` write a file at the top level called ``_index.json`` + containing a serialization of a `ZipIndex` for the downloaded + datasets. Returns ------- @@ -1465,6 +1470,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, + write_index: bool = True, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: raise NotImplementedError("This is a no-op datastore that can not access a real datastore") diff --git a/python/lsst/daf/butler/datastores/chainedDatastore.py b/python/lsst/daf/butler/datastores/chainedDatastore.py index 1a87d66061..c9fd1da580 100644 --- a/python/lsst/daf/butler/datastores/chainedDatastore.py +++ b/python/lsst/daf/butler/datastores/chainedDatastore.py @@ -49,6 +49,7 @@ from lsst.daf.butler.datastore.constraints import Constraints from lsst.daf.butler.datastore.record_data import DatastoreRecordData from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo +from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex from lsst.resources import ResourcePath from lsst.utils import doImportType @@ -805,6 +806,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, + write_index: bool = True, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the file artifacts associated with the supplied refs. @@ -827,6 +829,10 @@ def retrieveArtifacts( overwrite : `bool`, optional If `True` allow transfers to overwrite existing files at the destination. + write_index : `bool`, optional + If `True` write a file at the top level called ``_index.json`` + containing a serialization of a `ZipIndex` for the downloaded + datasets. Returns ------- @@ -893,11 +899,18 @@ def retrieveArtifacts( transfer=transfer, preserve_path=preserve_path, overwrite=overwrite, + write_index=False, # Disable index writing regardless. 
) targets.extend(retrieved) merged_artifacts_to_ref_id.update(artifacts_to_ref_id) merged_artifacts_to_info.update(artifacts_to_info) + if write_index: + index = ZipIndex.from_artifact_maps( + refs, merged_artifacts_to_ref_id, merged_artifacts_to_info, destination + ) + index.write_index(destination) + return targets, merged_artifacts_to_ref_id, merged_artifacts_to_info def remove(self, ref: DatasetRef) -> None: diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index 2b0de81642..afc746d990 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -81,6 +81,7 @@ get_dataset_as_python_object_from_get_info, ) from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ( + ZipIndex, determine_destination_for_retrieved_artifact, ) from lsst.daf.butler.datastores.fileDatastoreClient import ( @@ -1953,6 +1954,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, + write_index: bool = True, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the file artifacts associated with the supplied refs. @@ -1975,6 +1977,10 @@ def retrieveArtifacts( overwrite : `bool`, optional If `True` allow transfers to overwrite existing files at the destination. + write_index : `bool`, optional + If `True` write a file at the top level called ``_index.json`` + containing a serialization of a `ZipIndex` for the downloaded + datasets. Returns ------- @@ -2022,6 +2028,10 @@ def retrieveArtifacts( for source_uri, target_uri in to_transfer.items(): target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) + if write_index: + index = ZipIndex.from_artifact_maps(refs, artifact_to_ref_id, artifact_to_info, destination) + index.write_index(destination) + return list(to_transfer.values()), artifact_to_ref_id, artifact_to_info def get( diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index ab6cccb45d..ef20aea853 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -33,6 +33,7 @@ import uuid import zipfile from collections.abc import Callable, Iterable +from typing import ClassVar, Self from lsst.daf.butler import DatasetId, DatasetIdFactory, DatasetRef, SerializedDatasetRef from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo, StoredFileInfo @@ -57,6 +58,114 @@ class ZipIndex(BaseModel): info_map: dict[str, SerializedStoredFileInfo] """Mapping of each Zip member to the associated datastore record.""" + index_name: ClassVar[str] = "_index.json" + """Name to use when saving the index to a file.""" + + def generate_uuid5(self) -> uuid.UUID: + """Create a UUID based on the Zip index. + + Returns + ------- + id_ : `uuid.UUID` + A UUID5 created from the paths inside the Zip file. Guarantees + that if the Zip file is regenerated with exactly the same file + paths the same answer will be returned. + """ + # Options are: + # - uuid5 based on file paths in zip + # - uuid5 based on ref uuids. + # - checksum derived from the above. + # - uuid5 from file paths and dataset refs. + # Do not attempt to include file contents in UUID. + # Start with uuid5 from file paths. 
+ data = ",".join(self.info_map.keys()) + # No need to come up with a different namespace. + return uuid.uuid5(DatasetIdFactory.NS_UUID, data) + + def write_index(self, dir: ResourcePath) -> ResourcePath: + """Write the index to the specified directory. + + Parameters + ---------- + dir : `~lsst.resources.ResourcePath` + Directory to write the index file to. + + Returns + ------- + index_path : `~lsst.resources.ResourcePath` + Path to the index file that was written. + """ + index_path = dir.join(self.index_name, forceDirectory=False) + with index_path.open("w") as fd: + print(self.model_dump_json(exclude_defaults=True, exclude_unset=True), file=fd) + return index_path + + @classmethod + def calc_relative_paths( + cls, root: ResourcePath, paths: Iterable[ResourcePath] + ) -> dict[ResourcePath, str]: + """Calculate the path to use inside the Zip file from the full path. + + Parameters + ---------- + root : `lsst.resources.ResourcePath` + The reference root directory. + paths : `~collections.abc.Iterable` [ `lsst.resources.ResourcePath` ] + The paths to the files that should be included in the Zip file. + + Returns + ------- + abs_to_rel : `dict` [ `~lsst.resources.ResourcePath`, `str` ] + Mapping of the original file path to the relative path to use + in Zip file. + """ + file_to_relative: dict[ResourcePath, str] = {} + for p in paths: + # It is an error if there is no relative path. + rel = p.relative_to(root) + assert rel is not None + file_to_relative[p] = rel + return file_to_relative + + @classmethod + def from_artifact_maps( + cls, + refs: Iterable[DatasetRef], + id_map: dict[ResourcePath, list[DatasetId]], + info_map: dict[ResourcePath, StoredFileInfo], + root: ResourcePath, + ) -> Self: + """Create an index from the mappings returned from + `Datastore.retrieveArtifacts`. + + Parameters + ---------- + refs : `~collections.abc.Iterable` [ `list` ] + Datasets present in the index. + id_map : `dict` [ `lsst.resources.ResourcePath`, \ + `list` [ `uuid.UUID`] ] + Mapping of retrieved artifact path to `DatasetRef` ID. + info_map : `dict` [ `~lsst.resources.ResourcePath`, \ + `StoredDatastoreItemInfo` ] + Mapping of retrieved artifact path to datastore record information. + root : `lsst.resources.ResourcePath` + Root path to be removed from all the paths before creating the + index. + """ + # Calculate the paths relative to the given root since the Zip file + # uses relative paths. + file_to_relative = cls.calc_relative_paths(root, info_map.keys()) + + # Convert the mappings to simplified form for pydantic. + # and use the relative paths that will match the zip. 
+ simplified_refs = [ref.to_simple() for ref in refs] + simplified_ref_map = {} + for path, ids in id_map.items(): + simplified_ref_map[file_to_relative[path]] = ids + simplified_info_map = {file_to_relative[path]: info.to_simple() for path, info in info_map.items()} + + return cls(refs=simplified_refs, ref_map=simplified_ref_map, info_map=simplified_info_map) + def determine_destination_for_retrieved_artifact( destination_directory: ResourcePath, source_path: ResourcePath, preserve_path: bool @@ -102,7 +211,7 @@ def retrieve_and_zip( refs: Iterable[DatasetRef], destination: ResourcePathExpression, retrieval_callback: Callable[ - [Iterable[DatasetRef], ResourcePath, str, bool, bool], + [Iterable[DatasetRef], ResourcePath, str, bool, bool, bool], tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]], ], ) -> ResourcePath: @@ -139,48 +248,20 @@ def retrieve_and_zip( # - Return name of zip file. with tempfile.TemporaryDirectory(dir=outdir.ospath, ignore_cleanup_errors=True) as tmpdir: tmpdir_path = ResourcePath(tmpdir, forceDirectory=True) - paths, refmap, infomap = retrieval_callback(refs, tmpdir_path, "copy", True, False) - # Will store the relative paths in the zip file. - # These relative paths cannot be None but mypy doesn't know this. - file_to_relative: dict[ResourcePath, str] = {} - for p in paths: - rel = p.relative_to(tmpdir_path) - assert rel is not None - file_to_relative[p] = rel - - # Name of zip file. Options are: - # - uuid that changes every time. - # - uuid5 based on file paths in zip - # - uuid5 based on ref uuids. - # - checksum derived from the above. - # Start with uuid5 from file paths. - data = ",".join(file_to_relative.values()) - # No need to come up with a different namespace. - zip_uuid = uuid.uuid5(DatasetIdFactory.NS_UUID, data) - zip_file_name = f"{zip_uuid}.zip" - - # Index maps relative path to DatasetRef. - # The index has to: - # - list all the DatasetRef - # - map each artifact's path to a ref - # - include the datastore records to allow formatter to be - # extracted for that path and component be associated. - # Simplest not to try to include the full set of StoredItemInfo. - - # Convert the mappings to simplified form for pydantic. - # and use the relative paths that will match the zip. - simplified_refs = [ref.to_simple() for ref in refs] - simplified_ref_map = {} - for path, ids in refmap.items(): - simplified_ref_map[file_to_relative[path]] = ids - simplified_info_map = {file_to_relative[path]: info.to_simple() for path, info in infomap.items()} + # Retrieve the artifacts and write the index file. + paths, _, _ = retrieval_callback(refs, tmpdir_path, "auto", True, False, True) - index = ZipIndex(refs=simplified_refs, ref_map=simplified_ref_map, info_map=simplified_info_map) + # Read the index to construct file name. + index_path = tmpdir_path.join(ZipIndex.index_name, forceDirectory=False) + index_json = index_path.read() + index = ZipIndex.model_validate_json(index_json) + # Use unique name based on files in Zip. 
+ zip_file_name = f"{index.generate_uuid5()}.zip" zip_path = outdir.join(zip_file_name, forceDirectory=False) with zipfile.ZipFile(zip_path.ospath, "w") as zip: - zip.writestr("_index.json", index.model_dump_json(exclude_defaults=True, exclude_unset=True)) - for path, name in file_to_relative.items(): + zip.write(index_path.ospath, index_path.basename()) + for path, name in index.calc_relative_paths(tmpdir_path, paths).items(): zip.write(path.ospath, name) return zip_path diff --git a/python/lsst/daf/butler/datastores/inMemoryDatastore.py b/python/lsst/daf/butler/datastores/inMemoryDatastore.py index 31cde89131..1149b610e6 100644 --- a/python/lsst/daf/butler/datastores/inMemoryDatastore.py +++ b/python/lsst/daf/butler/datastores/inMemoryDatastore.py @@ -524,6 +524,7 @@ def retrieveArtifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool | None = False, + write_index: bool = True, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the file artifacts associated with the supplied refs. @@ -546,6 +547,10 @@ def retrieveArtifacts( overwrite : `bool`, optional If `True` allow transfers to overwrite existing files at the destination. + write_index : `bool`, optional + If `True` write a file at the top level called ``_index.json`` + containing a serialization of a `ZipIndex` for the downloaded + datasets. Notes ----- diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index 5eacd9eed3..dd748a5df8 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -67,7 +67,7 @@ from .._storage_class import StorageClass, StorageClassFactory from .._timespan import Timespan from ..datastore import Datastore, NullDatastore -from ..datastores.file_datastore.retrieve_artifacts import retrieve_and_zip +from ..datastores.file_datastore.retrieve_artifacts import ZipIndex, retrieve_and_zip from ..dimensions import DataCoordinate, Dimension from ..direct_query_driver import DirectQueryDriver from ..progress import Progress @@ -1307,13 +1307,21 @@ def retrieveArtifacts( overwrite: bool = False, ) -> list[ResourcePath]: # Docstring inherited. - paths, _, _ = self._datastore.retrieveArtifacts( + outdir = ResourcePath(destination) + paths, id_map, info_map = self._datastore.retrieveArtifacts( refs, - ResourcePath(destination), + outdir, transfer=transfer, preserve_path=preserve_path, overwrite=overwrite, ) + # Write the index file. 
+ index = ZipIndex.from_artifact_maps( + refs, id_map, info_map, ResourcePath(destination, forceDirectory=True) + ) + index_path = outdir.join("_index.json") + with index_path.open("w") as fd: + print(index.model_dump_json(exclude_defaults=True, exclude_unset=True), file=fd) return paths def exists( diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 108c4744f9..fe574eeeef 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -39,6 +39,7 @@ from deprecated.sphinx import deprecated from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ( + ZipIndex, determine_destination_for_retrieved_artifact, retrieve_and_zip, ) @@ -424,6 +425,7 @@ def _retrieve_artifacts( transfer: str = "auto", preserve_path: bool = True, overwrite: bool = False, + write_index: bool = True, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: destination = ResourcePath(destination).abspath() if not destination.isdir(): @@ -450,6 +452,10 @@ def _retrieve_artifacts( artifact_to_ref_id[target_uri].append(ref.id) artifact_to_info[target_uri] = StoredFileInfo.from_simple(file.datastoreRecords) + if write_index: + index = ZipIndex.from_artifact_maps(refs, artifact_to_ref_id, artifact_to_info, destination) + index.write_index(destination) + return output_uris, artifact_to_ref_id, artifact_to_info def retrieve_artifacts_zip( @@ -467,7 +473,13 @@ def retrieveArtifacts( preserve_path: bool = True, overwrite: bool = False, ) -> list[ResourcePath]: - paths, _, _ = self._retrieve_artifacts(refs, destination, transfer, preserve_path, overwrite) + paths, _, _ = self._retrieve_artifacts( + refs, + destination, + transfer, + preserve_path, + overwrite, + ) return paths def exists( diff --git a/tests/test_butler.py b/tests/test_butler.py index ac696dba0a..d1cde82ef2 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -361,6 +361,8 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> But ) self.assertGreater(len(transferred), 0) artifacts = list(ResourcePath.findFileResources([destination])) + # Filter out the index file. 
+ artifacts = [a for a in artifacts if a.basename() != "_index.json"] self.assertEqual(set(transferred), set(artifacts)) for artifact in transferred: diff --git a/tests/test_cliCmdRetrieveArtifacts.py b/tests/test_cliCmdRetrieveArtifacts.py index 85e7bd9396..f5e6a65c9e 100644 --- a/tests/test_cliCmdRetrieveArtifacts.py +++ b/tests/test_cliCmdRetrieveArtifacts.py @@ -75,7 +75,7 @@ def testRetrieveAll(self): self.assertTrue(result.stdout.endswith(": 6\n"), f"Expected 6 got: {result.stdout}") artifacts = self.find_files(destdir) - self.assertEqual(len(artifacts), 6, f"Expected 6 artifacts: {artifacts}") + self.assertEqual(len(artifacts), 7, f"Expected 7 artifacts including index: {artifacts}") self.assertIn(f"{destdir}{prefix}", str(artifacts[1])) def testRetrieveSubset(self): @@ -95,7 +95,7 @@ def testRetrieveSubset(self): self.assertEqual(result.exit_code, 0, clickResultMsg(result)) self.assertTrue(result.stdout.endswith(": 3\n"), f"Expected 3 got: {result.stdout}") artifacts = self.find_files(destdir) - self.assertEqual(len(artifacts), 3, f"Expected 3 artifacts: {artifacts}") + self.assertEqual(len(artifacts), 4, f"Expected 4 artifacts including index: {artifacts}") def testOverwriteLink(self): runner = LogCliRunner() From cf25c1361d6006a480e1f2d2eba17f15756e0a21 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 15 Oct 2024 17:12:00 -0700 Subject: [PATCH 04/39] Add --zip command-line option to butler retrieve-artifacts --- python/lsst/daf/butler/cli/cmd/commands.py | 21 +++++++++++++------ .../file_datastore/retrieve_artifacts.py | 4 ++++ .../daf/butler/script/retrieveArtifacts.py | 19 +++++++++++++---- tests/test_cliCmdRetrieveArtifacts.py | 20 ++++++++++++++++++ 4 files changed, 54 insertions(+), 10 deletions(-) diff --git a/python/lsst/daf/butler/cli/cmd/commands.py b/python/lsst/daf/butler/cli/cmd/commands.py index 119be85c20..32e17f436e 100644 --- a/python/lsst/daf/butler/cli/cmd/commands.py +++ b/python/lsst/daf/butler/cli/cmd/commands.py @@ -602,17 +602,26 @@ def query_dimension_records(**kwargs: Any) -> None: default=False, help="If clobber, overwrite files if they exist locally.", ) +@click.option( + "--zip/--no-zip", + is_flag=True, + default=False, + help="Retrieve artifacts and place in a Zip file.", +) @options_file_option() def retrieve_artifacts(**kwargs: Any) -> None: """Retrieve file artifacts associated with datasets in a repository.""" verbose = kwargs.pop("verbose") transferred = script.retrieveArtifacts(**kwargs) - if verbose and transferred: - print(f"Transferred the following to {kwargs['destination']}:") - for uri in transferred: - print(uri) - print() - print(f"Number of artifacts retrieved into destination {kwargs['destination']}: {len(transferred)}") + if kwargs["zip"] and transferred: + print(f"Zip files written to {transferred[0]}") + else: + if verbose and transferred: + print(f"Transferred the following to {kwargs['destination']}:") + for uri in transferred: + print(uri) + print() + print(f"Number of artifacts retrieved into destination {kwargs['destination']}: {len(transferred)}") @click.command(cls=ButlerCommand) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index ef20aea853..2c2b1cb1b6 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -239,6 +239,10 @@ def retrieve_and_zip( outdir = ResourcePath(destination, forceDirectory=True) if 
not outdir.isdir(): raise ValueError(f"Destination location must refer to a directory. Given {destination}") + + if not outdir.exists(): + outdir.mkdir() + # Simplest approach: # - create temp dir in destination # - Run retrieveArtifacts to that temp dir diff --git a/python/lsst/daf/butler/script/retrieveArtifacts.py b/python/lsst/daf/butler/script/retrieveArtifacts.py index a39e80862a..eea5537c66 100644 --- a/python/lsst/daf/butler/script/retrieveArtifacts.py +++ b/python/lsst/daf/butler/script/retrieveArtifacts.py @@ -54,6 +54,7 @@ def retrieveArtifacts( transfer: str, preserve_path: bool, clobber: bool, + zip: bool, ) -> list[ResourcePath]: """Parameters are those required for querying datasets plus a destination URI. @@ -89,11 +90,14 @@ def retrieveArtifacts( destination directory, else only the filename will be used. clobber : `bool` If `True` allow transfers to overwrite files at the destination. + zip : `bool` + If `True` retrieve the datasets and place in a zip file. Returns ------- transferred : `list` of `lsst.resources.ResourcePath` - The destination URIs of every transferred artifact. + The destination URIs of every transferred artifact or a list with a + single entry of the name of the zip file. """ query_types = dataset_type or "*" query_collections: tuple[str, ...] = collections or ("*",) @@ -114,8 +118,15 @@ def retrieveArtifacts( ) refs = list(itertools.chain(*query.getDatasets())) log.info("Number of datasets matching query: %d", len(refs)) + if not refs: + return [] + + if not zip: + transferred = butler.retrieveArtifacts( + refs, destination=destination, transfer=transfer, preserve_path=preserve_path, overwrite=clobber + ) + else: + zip_file = butler.retrieve_artifacts_zip(refs, destination=destination) + transferred = [zip_file] - transferred = butler.retrieveArtifacts( - refs, destination=destination, transfer=transfer, preserve_path=preserve_path, overwrite=clobber - ) return transferred diff --git a/tests/test_cliCmdRetrieveArtifacts.py b/tests/test_cliCmdRetrieveArtifacts.py index f5e6a65c9e..43f4a02663 100644 --- a/tests/test_cliCmdRetrieveArtifacts.py +++ b/tests/test_cliCmdRetrieveArtifacts.py @@ -97,6 +97,26 @@ def testRetrieveSubset(self): artifacts = self.find_files(destdir) self.assertEqual(len(artifacts), 4, f"Expected 4 artifacts including index: {artifacts}") + def testRetrieveAsZip(self): + runner = LogCliRunner() + with runner.isolated_filesystem(): + destdir = "tmp1/" + result = runner.invoke( + cli, + [ + "retrieve-artifacts", + self.root, + destdir, + "--where", + "instrument='DummyCamComp' AND visit=423", + "--zip", + ], + ) + self.assertEqual(result.exit_code, 0, clickResultMsg(result)) + self.assertIn("a90bbdc0-6c95-5352-b25b-d8ccf5b08adc.zip", result.stdout) + artifacts = self.find_files(destdir) + self.assertEqual(len(artifacts), 1, f"Expected one zip file: {artifacts}") + def testOverwriteLink(self): runner = LogCliRunner() with runner.isolated_filesystem(): From 81c827d3b3ae3e86899825cdb22d5bf3ee615dd2 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Wed, 16 Oct 2024 15:05:28 -0700 Subject: [PATCH 05/39] Change the internal representation of ZipIndex to remove duplication No longer use list[DatasetRef] but use SerializedDatasetRefContainer --- .../file_datastore/retrieve_artifacts.py | 161 +++++++++++++++++- 1 file changed, 154 insertions(+), 7 deletions(-) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 2c2b1cb1b6..eaf184b8e1 
100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -29,17 +29,149 @@ __all__ = ("determine_destination_for_retrieved_artifact", "retrieve_and_zip", "ZipIndex") +import logging import tempfile import uuid import zipfile from collections.abc import Callable, Iterable from typing import ClassVar, Self -from lsst.daf.butler import DatasetId, DatasetIdFactory, DatasetRef, SerializedDatasetRef +from lsst.daf.butler import DatasetId, DatasetIdFactory, DatasetRef from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo, StoredFileInfo from lsst.resources import ResourcePath, ResourcePathExpression from pydantic import BaseModel +from ..._dataset_type import DatasetType, SerializedDatasetType +from ...dimensions import DataCoordinate, DimensionUniverse, SerializedDataCoordinate, SerializedDataId + +_LOG = logging.getLogger(__name__) + + +class MinimalistDatasetRef(BaseModel): + """Minimal information needed to define a DatasetRef. + + The ID is not included and is presumed to be the key to a mapping + to this information. + """ + + dataset_type_name: str + """Name of the dataset type.""" + + run: str + """Name of the RUN collection.""" + + data_id: SerializedDataId + """Data coordinate of this dataset.""" + + +class SerializedDatasetRefContainer(BaseModel): + """Serializable model for a collection of DatasetRef. + + Dimension records are not included. + """ + + universe_version: int + """Dimension universe version.""" + + universe_namespace: str + """Dimension universe namespace.""" + + dataset_types: dict[str, SerializedDatasetType] + """Dataset types indexed by their name.""" + + compact_refs: dict[uuid.UUID, MinimalistDatasetRef] + """Minimal dataset ref information indexed by UUID.""" + + @classmethod + def from_refs(cls, refs: Iterable[DatasetRef]) -> Self: + """Construct a serializable form from a list of `DatasetRef`. + + Parameters + ---------- + refs : `~collections.abc.Iterable` [ `DatasetRef` ] + The datasets to include in the container. + """ + # The serialized DatasetRef contains a lot of duplicated information. + # We also want to drop dimension records and assume that the records + # are already in the registry. + universe: DimensionUniverse | None = None + dataset_types: dict[str, SerializedDatasetType] = {} + compact_refs: dict[uuid.UUID, MinimalistDatasetRef] = {} + for ref in refs: + simple_ref = ref.to_simple() + dataset_type = simple_ref.datasetType + assert dataset_type is not None # For mypy + if universe is None: + universe = ref.datasetType.dimensions.universe + if (name := dataset_type.name) not in dataset_types: + dataset_types[name] = dataset_type + data_id = simple_ref.dataId + assert data_id is not None # For mypy + compact_refs[simple_ref.id] = MinimalistDatasetRef( + dataset_type_name=name, run=simple_ref.run, data_id=data_id.dataId + ) + if universe: + universe_version = universe.version + universe_namespace = universe.namespace + else: + # No refs so no universe. + universe_version = 0 + universe_namespace = "unknown" + return cls( + universe_version=universe_version, + universe_namespace=universe_namespace, + dataset_types=dataset_types, + compact_refs=compact_refs, + ) + + def to_refs(self, universe: DimensionUniverse) -> list[DatasetRef]: + """Construct the original `DatasetRef`. + + Parameters + ---------- + universe : `DimensionUniverse` + The universe to use when constructing the `DatasetRef`. 
+ + Returns + ------- + refs : `list` [ `DatasetRef` ] + The `DatasetRef` that were serialized. + """ + if not self.compact_refs: + return [] + + if universe.namespace != self.universe_namespace: + raise RuntimeError( + f"Can not convert to refs in universe {universe.namespace} that were created frp, " + f"universe {self.universe_namespace}" + ) + + if universe.version != self.universe_version: + _LOG.warning( + "Universe mismatch when attempting to reconstruct DatasetRef from serialized form. " + "Serialized with version %d but asked to use version %d.", + self.universe_version, + universe.version, + ) + + # Reconstruct the DatasetType objects. + dataset_types = { + name: DatasetType.from_simple(dtype, universe=universe) + for name, dtype in self.dataset_types.items() + } + refs: list[DatasetRef] = [] + for id_, minimal in self.compact_refs.items(): + simple_data_id = SerializedDataCoordinate(dataId=minimal.data_id) + data_id = DataCoordinate.from_simple(simple=simple_data_id, universe=universe) + ref = DatasetRef( + id=id_, + run=minimal.run, + datasetType=dataset_types[minimal.dataset_type_name], + dataId=data_id, + ) + refs.append(ref) + return refs + class ZipIndex(BaseModel): """Index of a Zip file of Butler datasets. @@ -50,11 +182,12 @@ class ZipIndex(BaseModel): file datastore. """ - refs: list[SerializedDatasetRef] - """The Butler datasets stored in the Zip file.""" - # Can have multiple refs associated with a single file. + refs: SerializedDatasetRefContainer + """Deduplicated information for all the `DatasetRef` in the index.""" + ref_map: dict[str, list[uuid.UUID]] """Mapping of Zip member to one or more dataset UUIDs.""" + info_map: dict[str, SerializedStoredFileInfo] """Mapping of each Zip member to the associated datastore record.""" @@ -140,7 +273,7 @@ def from_artifact_maps( Parameters ---------- - refs : `~collections.abc.Iterable` [ `list` ] + refs : `~collections.abc.Iterable` [ `DatasetRef` ] Datasets present in the index. id_map : `dict` [ `lsst.resources.ResourcePath`, \ `list` [ `uuid.UUID`] ] @@ -152,19 +285,33 @@ def from_artifact_maps( Root path to be removed from all the paths before creating the index. """ + if not refs: + return cls( + universe_version=0, + universe_namespace="daf_butler", + refs=SerializedDatasetRefContainer.from_refs(refs), + ref_map={}, + info_map={}, + ) + # Calculate the paths relative to the given root since the Zip file # uses relative paths. file_to_relative = cls.calc_relative_paths(root, info_map.keys()) + simplified_refs = SerializedDatasetRefContainer.from_refs(refs) + # Convert the mappings to simplified form for pydantic. # and use the relative paths that will match the zip. 
- simplified_refs = [ref.to_simple() for ref in refs] simplified_ref_map = {} for path, ids in id_map.items(): simplified_ref_map[file_to_relative[path]] = ids simplified_info_map = {file_to_relative[path]: info.to_simple() for path, info in info_map.items()} - return cls(refs=simplified_refs, ref_map=simplified_ref_map, info_map=simplified_info_map) + return cls( + refs=simplified_refs, + ref_map=simplified_ref_map, + info_map=simplified_info_map, + ) def determine_destination_for_retrieved_artifact( From fafce2be05ea1fa567e09904016489393efcc1d8 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Wed, 16 Oct 2024 15:22:34 -0700 Subject: [PATCH 06/39] Add method to create ZipIndex from zip file --- .../datastores/file_datastore/retrieve_artifacts.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index eaf184b8e1..12666588f0 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -313,6 +313,19 @@ def from_artifact_maps( info_map=simplified_info_map, ) + @classmethod + def from_zip_file(cls, zip_path: ResourcePath) -> Self: + """Given a path to a Zip file return the index. + + Parameters + ---------- + zip_path : `lsst.resources.ResourcePath` + Path to the Zip file. + """ + with zip_path.open("rb") as fd, zipfile.ZipFile(fd) as zf: + json_data = zf.read(cls.index_name) + return cls.model_validate_json(json_data) + def determine_destination_for_retrieved_artifact( destination_directory: ResourcePath, source_path: ResourcePath, preserve_path: bool From 0bafc7a13b1ab6234cb5c1a8a04bfc9c100b59f0 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Thu, 17 Oct 2024 14:45:17 -0700 Subject: [PATCH 07/39] First attempt at butler ingest_zip --- python/lsst/daf/butler/_butler.py | 19 ++++ .../lsst/daf/butler/datastore/_datastore.py | 18 ++++ .../daf/butler/datastores/chainedDatastore.py | 43 +++++++++ .../daf/butler/datastores/fileDatastore.py | 81 +++++++++++++++++ .../file_datastore/retrieve_artifacts.py | 21 ++++- .../butler/datastores/inMemoryDatastore.py | 3 + .../butler/direct_butler/_direct_butler.py | 91 +++++++++++++++++-- .../butler/remote_butler/_remote_butler.py | 4 + python/lsst/daf/butler/tests/hybrid_butler.py | 4 + 9 files changed, 273 insertions(+), 11 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 9593c13f1b..19341e9ed2 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -1227,6 +1227,25 @@ def ingest( """ raise NotImplementedError() + @abstractmethod + def ingest_zip(self, zip_file: ResourcePathExpression, transfer: str = "auto") -> None: + """Ingest a Zip file into this butler. + + The Zip file must have been created by `retrieve_artifacts_zip`. + + Parameters + ---------- + zip_file : `lsst.resources.ResourcePathExpression` + Path to the Zip file. + transfer : `str`, optional + Method to use to transfer the Zip into the datastore. + + Notes + ----- + Run collections are created as needed. 
+ """ + raise NotImplementedError() + @abstractmethod def export( self, diff --git a/python/lsst/daf/butler/datastore/_datastore.py b/python/lsst/daf/butler/datastore/_datastore.py index 289aee2df8..06c1e9731c 100644 --- a/python/lsst/daf/butler/datastore/_datastore.py +++ b/python/lsst/daf/butler/datastore/_datastore.py @@ -1073,6 +1073,21 @@ def retrieveArtifacts( """ raise NotImplementedError() + @abstractmethod + def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: + """Ingest an indexed Zip file and contents. + + The Zip file must have an index file as created by `retrieveArtifacts`. + + Parameters + ---------- + zip_path : `lsst.resources.ResourcePath` + Path to the Zip file. + transfer : `str` + Method to use for transferring the Zip file into the datastore. + """ + raise NotImplementedError() + @abstractmethod def remove(self, datasetRef: DatasetRef) -> None: """Indicate to the Datastore that a Dataset can be removed. @@ -1463,6 +1478,9 @@ def getURIs(self, datasetRef: DatasetRef, predict: bool = False) -> DatasetRefUR def getURI(self, datasetRef: DatasetRef, predict: bool = False) -> ResourcePath: raise FileNotFoundError("This is a no-op datastore that can not access a real datastore") + def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: + raise NotImplementedError("Can only ingest a Zip into a real datastore.") + def retrieveArtifacts( self, refs: Iterable[DatasetRef], diff --git a/python/lsst/daf/butler/datastores/chainedDatastore.py b/python/lsst/daf/butler/datastores/chainedDatastore.py index c9fd1da580..8f7ed8ceb6 100644 --- a/python/lsst/daf/butler/datastores/chainedDatastore.py +++ b/python/lsst/daf/butler/datastores/chainedDatastore.py @@ -913,6 +913,49 @@ def retrieveArtifacts( return targets, merged_artifacts_to_ref_id, merged_artifacts_to_info + def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: + """Ingest an indexed Zip file and contents. + + The Zip file must have an index file as created by `retrieveArtifacts`. + + Parameters + ---------- + zip_path : `lsst.resources.ResourcePath` + Path to the Zip file. + transfer : `str` + Method to use for transferring the Zip file into the datastore. + + Notes + ----- + Datastore constraints are bypassed with Zip ingest. A zip file can + contain multiple dataset types. Should the entire Zip be rejected + if one dataset type is in the constraints list? If configured to + reject everything, ingest should not be attempted. + + The Zip file is given to each datastore in turn, ignoring datastores + where it is not supported. Is deemed successful if any of the + datastores accept the file. + """ + n_success = 0 + final_exception: Exception | None = None + for number, datastore in enumerate(self.datastores): + if datastore.isEphemeral: + continue + try: + datastore.ingest_zip(zip_path, transfer=transfer) + except NotImplementedError: + continue + except Exception as e: + final_exception = e + else: + n_success += 1 + + if n_success: + return + if final_exception: + raise final_exception + raise RuntimeError("Ingest was not successful in any datastores.") + def remove(self, ref: DatasetRef) -> None: """Indicate to the datastore that a dataset can be removed. 
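
A minimal usage sketch of the round trip these new methods enable; the repository paths, dataset type name, and collection name below are placeholders rather than anything taken from the patch:

    from lsst.daf.butler import Butler

    # Collect some refs from a source repo (names are illustrative).
    source = Butler.from_config("/path/to/source_repo")
    refs = list(source.registry.queryDatasets("metrics", collections="my_run"))

    # Package the file artifacts plus an index into a single Zip file.
    zip_path = source.retrieve_artifacts_zip(refs, destination="/tmp/zips")

    # The Zip carries its own index (refs plus datastore records), so it can
    # be ingested into a writeable repo; run collections are created as
    # needed, but dimension records must already exist in the destination.
    dest = Butler.from_config("/path/to/dest_repo", writeable=True)
    dest.ingest_zip(zip_path, transfer="copy")
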
diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index afc746d990..bf881f56e9 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -297,6 +297,8 @@ def __init__( else: self.cacheManager = DatastoreDisabledCacheManager("", universe=bridgeManager.universe) + self.universe = bridgeManager.universe + @classmethod def _create_from_config( cls, @@ -2034,6 +2036,85 @@ def retrieveArtifacts( return list(to_transfer.values()), artifact_to_ref_id, artifact_to_info + def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: + """Ingest an indexed Zip file and contents. + + The Zip file must have an index file as created by `retrieveArtifacts`. + + Parameters + ---------- + zip_path : `lsst.resources.ResourcePath` + Path to the Zip file. + transfer : `str` + Method to use for transferring the Zip file into the datastore. + + Notes + ----- + Datastore constraints are bypassed with Zip ingest. A zip file can + contain multiple dataset types. Should the entire Zip be rejected + if one dataset type is in the constraints list? + + If any dataset is already present in the datastore the entire ingest + will fail. + """ + index = ZipIndex.from_zip_file(zip_path) + + # Transfer the Zip file into the datastore file system. + # There is no RUN as such to use for naming. + # Potentially could use the RUN from the first ref in the index + # There is no requirement that the contents of the Zip files share + # the same RUN. + # Could use the Zip UUID from the index + special "zips/" prefix. + if transfer is None: + # Indicated that the zip file is already in the right place. + if not zip_path.isabs(): + tgtLocation = self.locationFactory.fromPath(zip_path.ospath, trusted_path=False) + else: + pathInStore = zip_path.relative_to(self.root) + if pathInStore is None: + raise RuntimeError( + f"Unexpectedly learned that {zip_path} is not within datastore {self.root}" + ) + tgtLocation = self.locationFactory.fromPath(pathInStore, trusted_path=True) + elif transfer == "direct": + # Reference in original location. + tgtLocation = None + else: + zip_name = index.calculate_zip_file_name() + # Zip name is UUID so add subdir of first few characters of UUID + # to spread things out if there are many Zip files. + tgtLocation = self.locationFactory.fromPath(f"zips/{zip_name[:4]}/{zip_name}") + if not tgtLocation.uri.dirname().exists(): + log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) + tgtLocation.uri.dirname().mkdir() + + # Transfer the Zip file into the datastore. + tgtLocation.uri.transfer_from( + zip_path, transfer=transfer, transaction=self._transaction, overwrite=True + ) + + if tgtLocation is None: + path_in_store = str(zip_path) + else: + path_in_store = tgtLocation.pathInStore.path + + # Refs indexed by UUID. + refs = index.refs.to_refs(universe=self.universe) + id_to_ref = {ref.id: ref for ref in refs} + + # Associate each file with a (DatasetRef, StoredFileInfo) tuple. + artifacts: list[tuple[DatasetRef, StoredFileInfo]] = [] + for path_in_zip, serialized_info in index.info_map.items(): + # Need to modify the info to include the path to the Zip file + # that was previously written to the datastore. 
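
For reference, a sketch of the stored-path convention this method builds up; the Zip name and member name here are invented for illustration:

    zip_name = "2ac4dd89-17ac-5354-9c15-cfec8dab5b3b.zip"  # from ZipIndex.calculate_zip_file_name()
    path_in_store = f"zips/{zip_name[:4]}/{zip_name}"      # location of the Zip inside the datastore
    stored_path = f"{path_in_store}#zip-path=metrics.json" # per-dataset datastore record path
    print(stored_path)
    # zips/2ac4/2ac4dd89-17ac-5354-9c15-cfec8dab5b3b.zip#zip-path=metrics.json
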
+ serialized_info.path = f"{path_in_store}#zip-path={path_in_zip}" + + info = StoredFileInfo.from_simple(serialized_info) + for id_ in index.ref_map[path_in_zip]: + artifacts.append((id_to_ref[id_], info)) + + self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) + def get( self, ref: DatasetRef, diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 12666588f0..233adce484 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -82,6 +82,10 @@ class SerializedDatasetRefContainer(BaseModel): compact_refs: dict[uuid.UUID, MinimalistDatasetRef] """Minimal dataset ref information indexed by UUID.""" + def __len__(self) -> int: + """Return the number of datasets in the container.""" + return len(self.compact_refs) + @classmethod def from_refs(cls, refs: Iterable[DatasetRef]) -> Self: """Construct a serializable form from a list of `DatasetRef`. @@ -215,6 +219,21 @@ def generate_uuid5(self) -> uuid.UUID: # No need to come up with a different namespace. return uuid.uuid5(DatasetIdFactory.NS_UUID, data) + def __len__(self) -> int: + """Return the number of files in the Zip.""" + return len(self.info_map) + + def calculate_zip_file_name(self) -> str: + """Calculate the default name for the Zip file based on the index + contents. + + Returns + ------- + name : `str` + Name of the zip file based on index. + """ + return f"{self.generate_uuid5()}.zip" + def write_index(self, dir: ResourcePath) -> ResourcePath: """Write the index to the specified directory. @@ -421,7 +440,7 @@ def retrieve_and_zip( index = ZipIndex.model_validate_json(index_json) # Use unique name based on files in Zip. 
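
Once written, the index can be read back without extracting the rest of the archive. A small consumer sketch, assuming ``zip_path`` is a ``ResourcePath`` pointing at a Zip produced by this code and ``butler`` supplies the dimension universe:

    from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex

    index = ZipIndex.from_zip_file(zip_path)
    print(len(index), "file artifacts,", len(index.refs), "datasets")
    print(index.calculate_zip_file_name())  # deterministic uuid5-based name
    refs = index.refs.to_refs(universe=butler.dimensions)
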
- zip_file_name = f"{index.generate_uuid5()}.zip" + zip_file_name = index.calculate_zip_file_name() zip_path = outdir.join(zip_file_name, forceDirectory=False) with zipfile.ZipFile(zip_path.ospath, "w") as zip: zip.write(index_path.ospath, index_path.basename()) diff --git a/python/lsst/daf/butler/datastores/inMemoryDatastore.py b/python/lsst/daf/butler/datastores/inMemoryDatastore.py index 1149b610e6..f66915bef6 100644 --- a/python/lsst/daf/butler/datastores/inMemoryDatastore.py +++ b/python/lsst/daf/butler/datastores/inMemoryDatastore.py @@ -517,6 +517,9 @@ def getURI(self, ref: DatasetRef, predict: bool = False) -> ResourcePath: raise AssertionError(f"Unexpectedly got no URI for in-memory datastore for {ref}") return primary + def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: + raise NotImplementedError("Can only ingest a Zip into a file datastore.") + def retrieveArtifacts( self, refs: Iterable[DatasetRef], diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index dd748a5df8..88715803a2 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -41,6 +41,7 @@ import logging import numbers import os +import uuid import warnings from collections import Counter, defaultdict from collections.abc import Iterable, Iterator, MutableMapping, Sequence @@ -62,6 +63,7 @@ from .._dataset_type import DatasetType from .._deferredDatasetHandle import DeferredDatasetHandle from .._exceptions import DatasetNotFoundError, DimensionValueError, EmptyQueryResultError, ValidationError +from .._file_dataset import FileDataset from .._limited_butler import LimitedButler from .._registry_shim import RegistryShim from .._storage_class import StorageClass, StorageClassFactory @@ -88,7 +90,6 @@ from lsst.resources import ResourceHandleProtocol from .._dataset_ref import DatasetId - from .._file_dataset import FileDataset from ..datastore import DatasetRefURIs from ..dimensions import DataId, DataIdValue, DimensionElement, DimensionRecord, DimensionUniverse from ..registry import CollectionArgType, Registry @@ -1518,17 +1519,68 @@ def pruneDatasets( self._datastore.emptyTrash() @transactional - def ingest( - self, - *datasets: FileDataset, - transfer: str | None = "auto", - record_validation_info: bool = True, - ) -> None: - # Docstring inherited. + def ingest_zip(self, zip_file: ResourcePathExpression, transfer: str = "auto") -> None: + """Ingest a Zip file into this butler. + + The Zip file must have been created by `retrieve_artifacts_zip`. + + Parameters + ---------- + zip_file : `lsst.resources.ResourcePathExpression` + Path to the Zip file. + transfer : `str`, optional + Method to use to transfer the Zip into the datastore. + + Notes + ----- + Run collections are created as needed. + """ if not self.isWriteable(): raise TypeError("Butler is read-only.") - _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") + zip_path = ResourcePath(zip_file) + index = ZipIndex.from_zip_file(zip_path) + _LOG.verbose( + "Ingesting %s containing %d datasets and %d files.", zip_path, len(index.refs), len(index) + ) + + # Need to ingest the refs into registry. Re-use the standard ingest + # code by reconstructing FileDataset from the index. 
+ refs = index.refs.to_refs(universe=self.dimensions) + id_to_ref = {ref.id: ref for ref in refs} + datasets: list[FileDataset] = [] + processed_ids: set[uuid.UUID] = set() + for path, ids in index.ref_map.items(): + # Disassembled composites need to check this ref isn't already + # included. + unprocessed = {id_ for id_ in ids if id_ not in processed_ids} + if not unprocessed: + continue + dataset = FileDataset(refs=[id_to_ref[id_] for id_ in unprocessed], path=path) + datasets.append(dataset) + processed_ids.update(unprocessed) + + # Ingest doesn't create the RUN collections so we have to do that + # here. + runs = {ref.run for ref in refs} + for run in runs: + registered = self.collections.register(run) + if registered: + _LOG.verbose("Created RUN collection %s as part of zip ingest", run) + + # Do not need expanded dataset refs so can ignore the return value. + self._ingest_file_datasets(datasets) + + try: + self._datastore.ingest_zip(zip_path, transfer=transfer) + except IntegrityError as e: + raise ConflictingDefinitionError(f"Datastore already contains one or more datasets: {e}") from e + + def _ingest_file_datasets( + self, + datasets: Sequence[FileDataset], + ) -> None: + # Docstring inherited. if not datasets: return @@ -1600,7 +1652,10 @@ def ingest( # guarantee that they are expanded and Datastore will need # the records. imported_refs = self._registry._importDatasets(refs_to_import, expand=True) - assert set(imported_refs) == set(refs_to_import) + + # Since refs can come from an external butler, there is no + # guarantee that the DatasetType values match. Compare IDs. + assert {ref.id for ref in imported_refs} == {ref.id for ref in refs_to_import} # Replace all the refs in the FileDataset with expanded versions. # Pull them off in the order we put them on the list. @@ -1609,6 +1664,22 @@ def ingest( dataset.refs = imported_refs[:n_dataset_refs] del imported_refs[:n_dataset_refs] + @transactional + def ingest( + self, + *datasets: FileDataset, + transfer: str | None = "auto", + record_validation_info: bool = True, + ) -> None: + # Docstring inherited. + if not datasets: + return + if not self.isWriteable(): + raise TypeError("Butler is read-only.") + _LOG.verbose("Ingesting %d file dataset%s.", len(datasets), "" if len(datasets) == 1 else "s") + + self._ingest_file_datasets(datasets) + # Bulk-insert everything into Datastore. # We do not know if any of the registry entries already existed # (_importDatasets only complains if they exist but differ) so diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index fe574eeeef..4076b5ab2e 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -535,6 +535,10 @@ def ingest( # Docstring inherited. raise NotImplementedError() + def ingest_zip(self, zip_file: ResourcePathExpression, transfer: str = "auto") -> None: + # Docstring inherited. 
+ raise NotImplementedError() + def export( self, *, diff --git a/python/lsst/daf/butler/tests/hybrid_butler.py b/python/lsst/daf/butler/tests/hybrid_butler.py index 030f3d5ffc..5fef275fbf 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler.py +++ b/python/lsst/daf/butler/tests/hybrid_butler.py @@ -249,6 +249,10 @@ def _exists_many( def removeRuns(self, names: Iterable[str], unstore: bool = True) -> None: return self._direct_butler.removeRuns(names, unstore) + def ingest_zip(self, zip_file: ResourcePathExpression, transfer: str = "auto") -> None: + # Docstring inherited. + return self._direct_butler.ingest_zip(zip_file, transfer=transfer) + def ingest( self, *datasets: FileDataset, From 1d6ce5196ac31ec501000c01923f11281a109487 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 18 Oct 2024 08:46:35 -0700 Subject: [PATCH 08/39] Strip directory paths when building zip file. This should be fine since we attempt to make the file names unique. It also works around an issue in ResourcePath where a "#" character is treated as part of a directory path if found in the non-file part of the path. --- .../butler/datastores/file_datastore/retrieve_artifacts.py | 4 ++-- tests/test_cliCmdRetrieveArtifacts.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 233adce484..c20cec5551 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -431,8 +431,8 @@ def retrieve_and_zip( # - Return name of zip file. with tempfile.TemporaryDirectory(dir=outdir.ospath, ignore_cleanup_errors=True) as tmpdir: tmpdir_path = ResourcePath(tmpdir, forceDirectory=True) - # Retrieve the artifacts and write the index file. - paths, _, _ = retrieval_callback(refs, tmpdir_path, "auto", True, False, True) + # Retrieve the artifacts and write the index file. Strip paths. + paths, _, _ = retrieval_callback(refs, tmpdir_path, "auto", False, False, True) # Read the index to construct file name. 
index_path = tmpdir_path.join(ZipIndex.index_name, forceDirectory=False) diff --git a/tests/test_cliCmdRetrieveArtifacts.py b/tests/test_cliCmdRetrieveArtifacts.py index 43f4a02663..2ff7a5b250 100644 --- a/tests/test_cliCmdRetrieveArtifacts.py +++ b/tests/test_cliCmdRetrieveArtifacts.py @@ -113,7 +113,7 @@ def testRetrieveAsZip(self): ], ) self.assertEqual(result.exit_code, 0, clickResultMsg(result)) - self.assertIn("a90bbdc0-6c95-5352-b25b-d8ccf5b08adc.zip", result.stdout) + self.assertIn("2ac4dd89-17ac-5354-9c15-cfec8dab5b3b.zip", result.stdout) artifacts = self.find_files(destdir) self.assertEqual(len(artifacts), 1, f"Expected one zip file: {artifacts}") From 642b90f0c4db362b940a63baf97647bf69f6240a Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 18 Oct 2024 14:15:16 -0700 Subject: [PATCH 09/39] Add butler ingest-zip command --- python/lsst/daf/butler/cli/cmd/__init__.py | 2 + python/lsst/daf/butler/cli/cmd/commands.py | 15 ++++++ python/lsst/daf/butler/script/__init__.py | 1 + python/lsst/daf/butler/script/ingest_zip.py | 52 +++++++++++++++++++++ 4 files changed, 70 insertions(+) create mode 100644 python/lsst/daf/butler/script/ingest_zip.py diff --git a/python/lsst/daf/butler/cli/cmd/__init__.py b/python/lsst/daf/butler/cli/cmd/__init__.py index 5b4c9b4164..fa12602ca1 100644 --- a/python/lsst/daf/butler/cli/cmd/__init__.py +++ b/python/lsst/daf/butler/cli/cmd/__init__.py @@ -35,6 +35,7 @@ "config_validate", "export_calibs", "ingest_files", + "ingest_zip", "prune_datasets", "query_collections", "query_data_ids", @@ -61,6 +62,7 @@ create, export_calibs, ingest_files, + ingest_zip, prune_datasets, query_collections, query_data_ids, diff --git a/python/lsst/daf/butler/cli/cmd/commands.py b/python/lsst/daf/butler/cli/cmd/commands.py index 32e17f436e..804dfcbaa1 100644 --- a/python/lsst/daf/butler/cli/cmd/commands.py +++ b/python/lsst/daf/butler/cli/cmd/commands.py @@ -813,3 +813,18 @@ def export_calibs(*args: Any, **kwargs: Any) -> None: table = script.exportCalibs(*args, **kwargs) if table: table.pprint_all(align="<") + + +@click.command(cls=ButlerCommand) +@repo_argument(required=True) +@click.argument("zip", required=True) +@transfer_option() +def ingest_zip(**kwargs: Any) -> None: + """Ingest a Zip file created by retrieve-artifacts. + + ZIP is the URI to the Zip file that should be ingested. + + This command does not create dimension records and so any records must + be created by other means. + """ + script.ingest_zip(**kwargs) diff --git a/python/lsst/daf/butler/script/__init__.py b/python/lsst/daf/butler/script/__init__.py index 964e1ddef7..45c2428a60 100644 --- a/python/lsst/daf/butler/script/__init__.py +++ b/python/lsst/daf/butler/script/__init__.py @@ -35,6 +35,7 @@ from .createRepo import createRepo from .exportCalibs import exportCalibs from .ingest_files import ingest_files +from .ingest_zip import ingest_zip from .queryCollections import queryCollections from .queryDataIds import queryDataIds from .queryDatasets import QueryDatasets diff --git a/python/lsst/daf/butler/script/ingest_zip.py b/python/lsst/daf/butler/script/ingest_zip.py new file mode 100644 index 0000000000..13f1695c31 --- /dev/null +++ b/python/lsst/daf/butler/script/ingest_zip.py @@ -0,0 +1,52 @@ +# This file is part of daf_butler. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. 
+# +# This software is dual licensed under the GNU General Public License and also +# under a 3-clause BSD license. Recipients may choose which of these licenses +# to use; please see the files gpl-3.0.txt and/or bsd_license.txt, +# respectively. If you choose the GPL option then the following text applies +# (but note that there is still no warranty even if you opt for BSD instead): +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +from __future__ import annotations + +__all__ = ["ingest_zip"] + +from .._butler import Butler + + +def ingest_zip( + repo: str, + zip: str, + transfer: str = "auto", +) -> None: + """Ingest a Zip file into the given butler. + + Parameters + ---------- + repo : `str` + URI string of the Butler repo to use. + zip : `str` + URI string of the location of the Zip file. + transfer : `str`, optional + Transfer mode to use for ingest. + """ + butler = Butler.from_config(repo, writeable=True) + + butler.ingest_zip(zip, transfer=transfer) From d19df2e6fdaac0fabfe6c42589648d5cbe1e618e Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Mon, 21 Oct 2024 16:11:26 -0700 Subject: [PATCH 10/39] Do not remove Zip file until last dataset in Zip is removed --- .../daf/butler/datastore/stored_file_info.py | 7 ++ .../daf/butler/datastores/fileDatastore.py | 99 +++++++++++++------ python/lsst/daf/butler/registry/opaque.py | 13 ++- 3 files changed, 86 insertions(+), 33 deletions(-) diff --git a/python/lsst/daf/butler/datastore/stored_file_info.py b/python/lsst/daf/butler/datastore/stored_file_info.py index ce9eb3303b..5dbc3f6711 100644 --- a/python/lsst/daf/butler/datastore/stored_file_info.py +++ b/python/lsst/daf/butler/datastore/stored_file_info.py @@ -365,6 +365,13 @@ def update(self, **kwargs: Any) -> StoredFileInfo: def __reduce__(self) -> str | tuple[Any, ...]: return (self.from_record, (self.to_record(),)) + @property + def artifact_path(self) -> str: + """Path to dataset as stored in Datastore with fragments removed.""" + if "#" in self.path: + return self.path[: self.path.rfind("#")] + return self.path + class SerializedStoredFileInfo(pydantic.BaseModel): """Serialized representation of `StoredFileInfo` properties.""" diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index bf881f56e9..ecd11f82cb 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -569,23 +569,50 @@ def _get_stored_records_associated_with_refs( records_by_ref[record["dataset_id"]].append(StoredFileInfo.from_record(record)) return records_by_ref - def _refs_associated_with_artifacts(self, paths: list[str | ResourcePath]) -> dict[str, set[DatasetId]]: + def _refs_associated_with_artifacts( + self, paths: Iterable[str | ResourcePath] + ) -> dict[str, set[DatasetId]]: """Return paths and associated dataset refs. 
Parameters ---------- paths : `list` of `str` or `lsst.resources.ResourcePath` - All the paths to include in search. + All the paths to include in search. These are exact matches + to the entries in the records table and can include fragments. Returns ------- mapping : `dict` of [`str`, `set` [`DatasetId`]] Mapping of each path to a set of associated database IDs. + These are artifacts and so any fragments are stripped from the + keys. """ - records = self._table.fetch(path=[str(path) for path in paths]) - result = defaultdict(set) - for row in records: - result[row["path"]].add(row["dataset_id"]) + # Group paths by those that have fragments and those that do not. + with_fragment = set() + without_fragment = set() + for rpath in paths: + spath = str(rpath) # Typing says can be ResourcePath so must force to string. + if "#" in spath: + spath, fragment = spath.rsplit("#", 1) + with_fragment.add(spath) + else: + without_fragment.add(spath) + + result: dict[str, set[DatasetId]] = defaultdict(set) + if without_fragment: + records = self._table.fetch(path=without_fragment) + for row in records: + path = row["path"] + result[path].add(row["dataset_id"]) + if with_fragment: + # Do a query per prefix. + for path in with_fragment: + records = self._table.fetch(path=f"{path}#%") + for row in records: + # Need to strip fragments before adding to dict. + row_path = row["path"] + artifact_path = row_path[: row_path.rfind("#")] + result[artifact_path].add(row["dataset_id"]) return result def _registered_refs_per_artifact(self, pathInStore: ResourcePath) -> set[DatasetId]: @@ -2405,36 +2432,49 @@ def emptyTrash(self, ignore_errors: bool = True) -> None: # This requires multiple copies of the trashed items trashed, artifacts_to_keep = trash_data - if artifacts_to_keep is None: + # Assume that # in path means there are fragments involved. The + # fragments can not be handled by the emptyTrash bridge call + # so need to be processed independently. + # The generator has to be converted to a list for multiple + # iterations. Clean up the typing so that multiple isinstance + # tests aren't needed later. + trashed_list = [(ref, ninfo) for ref, ninfo in trashed if isinstance(ninfo, StoredFileInfo)] + + if artifacts_to_keep is None or any("#" in info[1].path for info in trashed_list): # The bridge is not helping us so have to work it out # ourselves. This is not going to be as efficient. - trashed = list(trashed) - - # The instance check is for mypy since up to this point it - # does not know the type of info. - path_map = self._refs_associated_with_artifacts( - [info.path for _, info in trashed if isinstance(info, StoredFileInfo)] - ) - - for ref, info in trashed: - # Mypy needs to know this is not the base class - assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" - - path_map[info.path].remove(ref.id) - if not path_map[info.path]: - del path_map[info.path] - - artifacts_to_keep = set(path_map) + # This mapping does not include the fragments. + if artifacts_to_keep is not None: + # This means we have already checked for non-fragment + # examples so can filter. 
+ paths_to_check = {info.path for _, info in trashed_list if "#" in info.path} + else: + paths_to_check = {info.path for _, info in trashed_list} + + path_map = self._refs_associated_with_artifacts(paths_to_check) + + for ref, info in trashed_list: + path = info.artifact_path + # For disassembled composites in a Zip it is possible + # for the same path to correspond to the same dataset ref + # multiple times so trap for that. + if ref.id in path_map[path]: + path_map[path].remove(ref.id) + if not path_map[path]: + del path_map[path] + + slow_artifacts_to_keep = set(path_map) + if artifacts_to_keep is not None: + artifacts_to_keep.update(slow_artifacts_to_keep) + else: + artifacts_to_keep = slow_artifacts_to_keep - for ref, info in trashed: + for ref, info in trashed_list: # Should not happen for this implementation but need # to keep mypy happy. assert info is not None, f"Internal logic error in emptyTrash with ref {ref}." - # Mypy needs to know this is not the base class - assert isinstance(info, StoredFileInfo), f"Unexpectedly got info of class {type(info)}" - - if info.path in artifacts_to_keep: + if info.artifact_path in artifacts_to_keep: # This is a multi-dataset artifact and we are not # removing all associated refs. continue @@ -2940,7 +2980,6 @@ def export_records(self, refs: Iterable[DatasetIdRef]) -> Mapping[str, Datastore dataset_records.setdefault(self._table.name, []).append(info) record_data = DatastoreRecordData(records=records) - print("XXXCX: ", self.name, record_data) return {self.name: record_data} def set_retrieve_dataset_type_method(self, method: Callable[[str], DatasetType | None] | None) -> None: diff --git a/python/lsst/daf/butler/registry/opaque.py b/python/lsst/daf/butler/registry/opaque.py index 2062c448cc..09d1c18461 100644 --- a/python/lsst/daf/butler/registry/opaque.py +++ b/python/lsst/daf/butler/registry/opaque.py @@ -98,7 +98,10 @@ def replace(self, *data: dict, transaction: DatastoreTransaction | None = None) # the database itself providing any rollback functionality. self._db.replace(self._table, *data) - def fetch(self, **where: Any) -> Iterator[sqlalchemy.RowMapping]: + def fetch( + self, + **where: Any, + ) -> Iterator[sqlalchemy.RowMapping]: # Docstring inherited from OpaqueTableStorage. def _batch_in_clause( @@ -123,8 +126,12 @@ def _batch_in_clauses(**where: Any) -> Iterator[sqlalchemy.sql.expression.Column if isinstance(v, list | tuple | set): batches.append(_batch_in_clause(column, v)) else: - # single "batch" for a regular eq operator - batches.append([column == v]) + if isinstance(v, str) and v.endswith("%"): + # Special case prefix queries. + batches.append([column.startswith(v[:-1])]) + else: + # single "batch" for a regular eq operator + batches.append([column == v]) for clauses in itertools.product(*batches): yield sqlalchemy.sql.and_(*clauses) From b078e7b23dafe70a3deb6301486605ee6043813f Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 22 Oct 2024 12:58:40 -0700 Subject: [PATCH 11/39] Fragments can be quoted so have to look for unquoted fragment in zip path --- python/lsst/daf/butler/_formatter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/lsst/daf/butler/_formatter.py b/python/lsst/daf/butler/_formatter.py index 5cdd75fd82..96b4518605 100644 --- a/python/lsst/daf/butler/_formatter.py +++ b/python/lsst/daf/butler/_formatter.py @@ -439,8 +439,8 @@ def read( # direct read from URI option and the contents of the Zip file must # be extracted. 
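
A small sketch of why the unquoted form of the fragment matters below; the member name is invented and the behaviour shown is plain percent-decoding:

    from urllib.parse import unquote

    fragment = "zip-path=metrics%2Bsummary.json"  # quoted form as it may appear in the URI
    _, path_in_zip = unquote(fragment).split("=")
    print(path_in_zip)  # -> "metrics+summary.json"
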
uri = self.file_descriptor.location.uri - if uri.fragment and uri.fragment.startswith("zip-path="): - _, path_in_zip = uri.fragment.split("=") + if uri.fragment and uri.unquoted_fragment.startswith("zip-path="): + _, path_in_zip = uri.unquoted_fragment.split("=") # Open the Zip file using ResourcePath. with uri.open("rb") as fd: From b165dabbdfeb9e93df81508023dabe318966325e Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 22 Oct 2024 13:02:06 -0700 Subject: [PATCH 12/39] Add minimal constraints checking for zip ingest For now rejects if any refs are rejected. --- .../daf/butler/datastores/chainedDatastore.py | 40 ++++++++++++++++++- .../daf/butler/datastores/fileDatastore.py | 14 +++++-- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/python/lsst/daf/butler/datastores/chainedDatastore.py b/python/lsst/daf/butler/datastores/chainedDatastore.py index 8f7ed8ceb6..a09984c3b7 100644 --- a/python/lsst/daf/butler/datastores/chainedDatastore.py +++ b/python/lsst/daf/butler/datastores/chainedDatastore.py @@ -38,7 +38,13 @@ from collections.abc import Callable, Collection, Iterable, Mapping, Sequence from typing import TYPE_CHECKING, Any -from lsst.daf.butler import DatasetId, DatasetRef, DatasetTypeNotSupportedError, FileDataset +from lsst.daf.butler import ( + DatasetId, + DatasetRef, + DatasetTypeNotSupportedError, + DimensionUniverse, + FileDataset, +) from lsst.daf.butler.datastore import ( DatasetRefURIs, Datastore, @@ -115,6 +121,9 @@ class ChainedDatastore(Datastore): datastoreConstraints: Sequence[Constraints | None] """Constraints to be applied to each of the child datastores.""" + universe: DimensionUniverse + """Dimension universe associated with the butler.""" + @classmethod def setConfigRoot(cls, root: str, config: Config, full: Config, overwrite: bool = True) -> None: """Set any filesystem-dependent config options for child Datastores to @@ -224,6 +233,8 @@ def __init__( else: self.datastoreConstraints = (None,) * len(self.datastores) + self.universe = bridgeManager.universe + log.debug("Created %s (%s)", self.name, ("ephemeral" if self.isEphemeral else "permanent")) @classmethod @@ -936,11 +947,36 @@ def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: where it is not supported. Is deemed successful if any of the datastores accept the file. """ + index = ZipIndex.from_zip_file(zip_path) + refs = index.refs.to_refs(self.universe) + + # For now raise if any refs are not supported. + # Being selective will require that we return the ingested refs + # to the caller so that registry can be modified to remove the + # entries. + if any(not self.constraints.isAcceptable(ref) for ref in refs): + raise DatasetTypeNotSupportedError( + "Some of the refs in the given Zip file are not acceptable to this datastore." + ) + n_success = 0 final_exception: Exception | None = None - for number, datastore in enumerate(self.datastores): + for number, (datastore, constraints) in enumerate( + zip(self.datastores, self.datastoreConstraints, strict=True) + ): if datastore.isEphemeral: continue + + # There can be constraints for the datastore in the configuration + # of the chaining, or constraints in the configuration of the + # datastore itself. 
+ if any( + (constraints is not None and not constraints.isAcceptable(ref)) + or not datastore.constraints.isAcceptable(ref) + for ref in refs + ): + log.debug("Datastore %s skipping zip ingest due to constraints", datastore.name) + continue try: datastore.ingest_zip(zip_path, transfer=transfer) except NotImplementedError: diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index ecd11f82cb..d8f9f5822f 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -2086,6 +2086,16 @@ def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: """ index = ZipIndex.from_zip_file(zip_path) + # Refs indexed by UUID. + refs = index.refs.to_refs(universe=self.universe) + id_to_ref = {ref.id: ref for ref in refs} + + # Any failing constraints trigger entire failure. + if any(not self.constraints.isAcceptable(ref) for ref in refs): + raise DatasetTypeNotSupportedError( + "Some refs in the Zip file are not supported by this datastore" + ) + # Transfer the Zip file into the datastore file system. # There is no RUN as such to use for naming. # Potentially could use the RUN from the first ref in the index @@ -2125,10 +2135,6 @@ def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: else: path_in_store = tgtLocation.pathInStore.path - # Refs indexed by UUID. - refs = index.refs.to_refs(universe=self.universe) - id_to_ref = {ref.id: ref for ref in refs} - # Associate each file with a (DatasetRef, StoredFileInfo) tuple. artifacts: list[tuple[DatasetRef, StoredFileInfo]] = [] for path_in_zip, serialized_info in index.info_map.items(): From 7fd3bcd0d87d86991766042d60f2b5a34547d2ac Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 22 Oct 2024 13:03:18 -0700 Subject: [PATCH 13/39] Add more diagnostics if JSON/YAML does not parse correctly --- python/lsst/daf/butler/formatters/json.py | 8 +++++++- python/lsst/daf/butler/formatters/yaml.py | 9 ++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/lsst/daf/butler/formatters/json.py b/python/lsst/daf/butler/formatters/json.py index 298adc73c2..2599831c81 100644 --- a/python/lsst/daf/butler/formatters/json.py +++ b/python/lsst/daf/butler/formatters/json.py @@ -63,7 +63,13 @@ def read_from_uri(self, uri: ResourcePath, component: str | None = None, expecte data = pytype.model_validate_json(json_bytes) else: # This can raise ValueError. - data = from_json(json_bytes) + try: + data = from_json(json_bytes) + except ValueError as e: + # Report on the first few bytes of the file. + bytes_str = json_bytes[:60].decode(errors="replace") + e.add_note(f"Error parsing JSON bytes starting with {bytes_str!r}") + raise return data diff --git a/python/lsst/daf/butler/formatters/yaml.py b/python/lsst/daf/butler/formatters/yaml.py index 61706f4095..400c68a156 100644 --- a/python/lsst/daf/butler/formatters/yaml.py +++ b/python/lsst/daf/butler/formatters/yaml.py @@ -49,7 +49,14 @@ class YamlFormatter(TypelessFormatter): def read_from_uri(self, uri: ResourcePath, component: str | None = None, expected_size: int = -1) -> Any: # Can not use ResourcePath.open() - data = yaml.safe_load(uri.read()) + yaml_bytes = uri.read() + try: + data = yaml.safe_load(yaml_bytes) + except Exception as e: + # Report on the first few bytes of the file. 
+            bytes_str = yaml_bytes[:60].decode(errors="replace")
+            e.add_note(f"Error parsing YAML bytes starting with {bytes_str!r}")
+            raise
         return data
 
     def to_bytes(self, in_memory_dataset: Any) -> bytes:

From 244c9b606577c83e659862f7f471ca066dfdd822 Mon Sep 17 00:00:00 2001
From: Tim Jenness
Date: Tue, 22 Oct 2024 13:03:46 -0700
Subject: [PATCH 14/39] Add tests for ingest-zip

---
 tests/test_butler.py              | 47 +++++++++++++++++++++++++++++++
 tests/test_quantumBackedButler.py | 12 ++++++++
 2 files changed, 59 insertions(+)

diff --git a/tests/test_butler.py b/tests/test_butler.py
index d1cde82ef2..c654996509 100644
--- a/tests/test_butler.py
+++ b/tests/test_butler.py
@@ -923,6 +923,50 @@ def testPytypePutCoercion(self) -> None:
         test_dict3 = butler.get(this_type, dataId=dataId, visit=425)
         self.assertEqual(get_full_type_name(test_dict3), "dict")
 
+    def test_ingest_zip(self) -> None:
+        """Create butler, export data, delete data, import from Zip."""
+        butler, dataset_type = self.create_butler(
+            run=self.default_run, storageClass="StructuredData", datasetTypeName="metrics"
+        )
+
+        metric = makeExampleMetrics()
+        refs = []
+        for visit in (423, 424, 425):
+            ref = butler.put(metric, dataset_type, instrument="DummyCamComp", visit=visit)
+            refs.append(ref)
+
+        # Retrieve a Zip file.
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdir:
+            zip = butler.retrieve_artifacts_zip(refs, destination=tmpdir)
+
+            # Ingest will fail.
+            with self.assertRaises(ConflictingDefinitionError):
+                butler.ingest_zip(zip)
+
+            # Clear out the collection.
+            butler.removeRuns([self.default_run], unstore=True)
+            self.assertFalse(butler.exists(refs[0]))
+
+            butler.ingest_zip(zip, transfer="copy")
+
+            # Check that the refs can be read again.
+            _ = [butler.get(ref) for ref in refs]
+
+            uri = butler.getURI(refs[2])
+            self.assertTrue(uri.exists())
+
+            # Delete one dataset. The Zip file should still exist and allow
+            # remaining refs to be read.
+            butler.pruneDatasets([refs[0]], purge=True, unstore=True)
+            self.assertTrue(uri.exists())
+
+            metric2 = butler.get(refs[1])
+            self.assertEqual(metric2, metric, msg=f"{metric2} != {metric}")
+
+            butler.removeRuns([self.default_run], unstore=True)
+            self.assertFalse(uri.exists())
+            self.assertFalse(butler.exists(refs[-1]))
+
     def testIngest(self) -> None:
         butler = self.create_empty_butler(run=self.default_run)
 
@@ -2139,6 +2183,9 @@ class InMemoryDatastoreButlerTestCase(ButlerTests, unittest.TestCase):
     def testIngest(self) -> None:
         pass
 
+    def test_ingest_zip(self) -> None:
+        pass
+
 
 class ClonedSqliteButlerTestCase(InMemoryDatastoreButlerTestCase, unittest.TestCase):
     """Test that a Butler with a Sqlite registry still works after cloning."""
diff --git a/tests/test_quantumBackedButler.py b/tests/test_quantumBackedButler.py
index 984abc22e6..fd86ceaa09 100644
--- a/tests/test_quantumBackedButler.py
+++ b/tests/test_quantumBackedButler.py
@@ -27,6 +27,7 @@
 
 import json
 import os
+import tempfile
 import unittest
 import unittest.mock
 from typing import cast
@@ -43,6 +44,7 @@
     RegistryConfig,
     StorageClass,
 )
+from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex
 from lsst.daf.butler.direct_butler import DirectButler
 from lsst.daf.butler.registry import _RegistryFactory
 from lsst.daf.butler.tests.utils import makeTestTempDir, removeTestTempDir
@@ -218,6 +220,16 @@ def test_getput(self) -> None:
 
         self.assertEqual(qbb._actual_output_refs, set(self.output_refs))
 
+        # Retrieve them as a Zip artifact.
+ with tempfile.TemporaryDirectory() as tmpdir: + zip = qbb.retrieve_artifacts_zip(self.output_refs, destination=tmpdir) + + index = ZipIndex.from_zip_file(zip) + zip_refs = index.refs.to_refs(universe=qbb.dimensions) + self.assertEqual(len(zip_refs), 4) + self.assertEqual(set(zip_refs), set(self.output_refs)) + self.assertEqual(len(index.info_map), 4) # Count number of artifacts in Zip. + def test_getDeferred(self) -> None: """Test for getDeferred method""" quantum = self.make_quantum() From fd40aebf30121397fd8140f8772bbec795df7ffb Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Wed, 23 Oct 2024 10:46:19 -0700 Subject: [PATCH 15/39] Add support for missing datastore records in trust mode for retrieveArtifacts This allows Zip retrieval to work in QBB mode. --- .../daf/butler/datastores/fileDatastore.py | 130 +++++++++++------- 1 file changed, 81 insertions(+), 49 deletions(-) diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index d8f9f5822f..49d6bb8b4d 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -1976,6 +1976,71 @@ def _locations_to_URI( return uris + @staticmethod + def _find_missing_records( + datastore: FileDatastore, + refs: Iterable[DatasetRef], + missing_ids: set[DatasetId], + artifact_existence: dict[ResourcePath, bool] | None = None, + ) -> dict[DatasetId, list[StoredFileInfo]]: + if not missing_ids: + return {} + + if artifact_existence is None: + artifact_existence = {} + + found_records: dict[DatasetId, list[StoredFileInfo]] = defaultdict(list) + id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} + + # This should be chunked in case we end up having to check + # the file store since we need some log output to show + # progress. + for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): + records = {} + for missing in missing_ids_chunk: + # Ask the source datastore where the missing artifacts + # should be. An execution butler might not know about the + # artifacts even if they are there. + expected = datastore._get_expected_dataset_locations_info(id_to_ref[missing]) + records[missing] = [info for _, info in expected] + + # Call the mexist helper method in case we have not already + # checked these artifacts such that artifact_existence is + # empty. This allows us to benefit from parallelism. + # datastore.mexists() itself does not give us access to the + # derived datastore record. + log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) + ref_exists = datastore._process_mexists_records( + id_to_ref, records, False, artifact_existence=artifact_existence + ) + + # Now go through the records and propagate the ones that exist. + location_factory = datastore.locationFactory + for missing, record_list in records.items(): + # Skip completely if the ref does not exist. + ref = id_to_ref[missing] + if not ref_exists[ref]: + log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) + continue + # Check for file artifact to decide which parts of a + # disassembled composite do exist. If there is only a + # single record we don't even need to look because it can't + # be a composite and must exist. 
+ if len(record_list) == 1: + dataset_records = record_list + else: + dataset_records = [ + record + for record in record_list + if artifact_existence[record.file_location(location_factory).uri] + ] + assert len(dataset_records) > 0, "Disassembled composite should have had some files." + + # Rely on source_records being a defaultdict. + found_records[missing].extend(dataset_records) + log.verbose("Completed scan for missing data files") + return found_records + def retrieveArtifacts( self, refs: Iterable[DatasetRef], @@ -2037,6 +2102,18 @@ def retrieveArtifacts( # Retrieve all the records in bulk indexed by ref.id. records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) + # Check for missing records. + known_ids = set(records) + log.debug("Number of datastore records found in database: %d", len(known_ids)) + requested_ids = {ref.id for ref in refs} + missing_ids = requested_ids - known_ids + + if missing_ids and not self.trustGetRequest: + raise ValueError(f"Number of datasets missing from this datastore: {len(missing_ids)}") + + missing_records = self._find_missing_records(self, refs, missing_ids) + records.update(missing_records) + # One artifact can be used by multiple DatasetRef. # e.g. DECam. artifact_to_ref_id: dict[ResourcePath, list[DatasetId]] = defaultdict(list) @@ -2605,55 +2682,10 @@ def transfer_from( len(missing_ids), len(requested_ids), ) - id_to_ref = {ref.id: ref for ref in refs if ref.id in missing_ids} - - # This should be chunked in case we end up having to check - # the file store since we need some log output to show - # progress. - for missing_ids_chunk in chunk_iterable(missing_ids, chunk_size=10_000): - records = {} - for missing in missing_ids_chunk: - # Ask the source datastore where the missing artifacts - # should be. An execution butler might not know about the - # artifacts even if they are there. - expected = source_datastore._get_expected_dataset_locations_info(id_to_ref[missing]) - records[missing] = [info for _, info in expected] - - # Call the mexist helper method in case we have not already - # checked these artifacts such that artifact_existence is - # empty. This allows us to benefit from parallelism. - # datastore.mexists() itself does not give us access to the - # derived datastore record. - log.verbose("Checking existence of %d datasets unknown to datastore", len(records)) - ref_exists = source_datastore._process_mexists_records( - id_to_ref, records, False, artifact_existence=artifact_existence - ) - - # Now go through the records and propagate the ones that exist. - location_factory = source_datastore.locationFactory - for missing, record_list in records.items(): - # Skip completely if the ref does not exist. - ref = id_to_ref[missing] - if not ref_exists[ref]: - log.warning("Asked to transfer dataset %s but no file artifacts exist for it.", ref) - continue - # Check for file artifact to decide which parts of a - # disassembled composite do exist. If there is only a - # single record we don't even need to look because it can't - # be a composite and must exist. - if len(record_list) == 1: - dataset_records = record_list - else: - dataset_records = [ - record - for record in record_list - if artifact_existence[record.file_location(location_factory).uri] - ] - assert len(dataset_records) > 0, "Disassembled composite should have had some files." - - # Rely on source_records being a defaultdict. 
- source_records[missing].extend(dataset_records) - log.verbose("Completed scan for missing data files") + found_records = self._find_missing_records( + source_datastore, refs, missing_ids, artifact_existence + ) + source_records.update(found_records) # See if we already have these records target_records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) From a53db144be8222a7d120042f18b810c677e6e383 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Wed, 23 Oct 2024 13:45:48 -0700 Subject: [PATCH 16/39] Add UUID prefix to each file placed in Zip This makes it easier to work out which file comes from which dataset ref and guarantees each file will be unique without directory path. --- python/lsst/daf/butler/datastore/_datastore.py | 6 ++++++ .../lsst/daf/butler/datastores/chainedDatastore.py | 5 +++++ python/lsst/daf/butler/datastores/fileDatastore.py | 12 +++++++++++- .../file_datastore/retrieve_artifacts.py | 14 +++++++++----- .../daf/butler/datastores/inMemoryDatastore.py | 5 +++++ .../daf/butler/remote_butler/_remote_butler.py | 4 +++- tests/test_cliCmdRetrieveArtifacts.py | 2 +- tests/test_remote_butler.py | 11 +++++++++++ 8 files changed, 51 insertions(+), 8 deletions(-) diff --git a/python/lsst/daf/butler/datastore/_datastore.py b/python/lsst/daf/butler/datastore/_datastore.py index 06c1e9731c..0285009e62 100644 --- a/python/lsst/daf/butler/datastore/_datastore.py +++ b/python/lsst/daf/butler/datastore/_datastore.py @@ -1025,6 +1025,7 @@ def retrieveArtifacts( preserve_path: bool = True, overwrite: bool = False, write_index: bool = True, + add_prefix: bool = False, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the artifacts associated with the supplied refs. @@ -1051,6 +1052,10 @@ def retrieveArtifacts( If `True` write a file at the top level called ``_index.json`` containing a serialization of a `ZipIndex` for the downloaded datasets. + add_prefix : `bool`, optional + If `True` and if ``preserve_path`` is `False`, apply a prefix to + the filenames corresponding to some part of the dataset ref ID. + This can be used to guarantee uniqueness. Returns ------- @@ -1489,6 +1494,7 @@ def retrieveArtifacts( preserve_path: bool = True, overwrite: bool = False, write_index: bool = True, + add_prefix: bool = False, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: raise NotImplementedError("This is a no-op datastore that can not access a real datastore") diff --git a/python/lsst/daf/butler/datastores/chainedDatastore.py b/python/lsst/daf/butler/datastores/chainedDatastore.py index a09984c3b7..e96be704c8 100644 --- a/python/lsst/daf/butler/datastores/chainedDatastore.py +++ b/python/lsst/daf/butler/datastores/chainedDatastore.py @@ -818,6 +818,7 @@ def retrieveArtifacts( preserve_path: bool = True, overwrite: bool = False, write_index: bool = True, + add_prefix: bool = False, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the file artifacts associated with the supplied refs. @@ -844,6 +845,9 @@ def retrieveArtifacts( If `True` write a file at the top level called ``_index.json`` containing a serialization of a `ZipIndex` for the downloaded datasets. + add_prefix : `bool`, optional + Add a prefix based on the DatasetId. Only used if ``preserve_path`` + is `False`. 
Returns ------- @@ -911,6 +915,7 @@ def retrieveArtifacts( preserve_path=preserve_path, overwrite=overwrite, write_index=False, # Disable index writing regardless. + add_prefix=add_prefix, ) targets.extend(retrieved) merged_artifacts_to_ref_id.update(artifacts_to_ref_id) diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index 49d6bb8b4d..cfcd4cc40a 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -2049,6 +2049,7 @@ def retrieveArtifacts( preserve_path: bool = True, overwrite: bool = False, write_index: bool = True, + add_prefix: bool = False, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the file artifacts associated with the supplied refs. @@ -2075,6 +2076,10 @@ def retrieveArtifacts( If `True` write a file at the top level called ``_index.json`` containing a serialization of a `ZipIndex` for the downloaded datasets. + add_prefix : `bool`, optional + If `True` and if ``preserve_path`` is `False`, apply a prefix to + the filenames corresponding to some part of the dataset ref ID. + This can be used to guarantee uniqueness. Returns ------- @@ -2119,12 +2124,17 @@ def retrieveArtifacts( artifact_to_ref_id: dict[ResourcePath, list[DatasetId]] = defaultdict(list) artifact_to_info: dict[ResourcePath, StoredFileInfo] = {} for ref in refs: + prefix = str(ref.id)[:8] + "-" if add_prefix else "" for info in records[ref.id]: location = info.file_location(self.locationFactory) source_uri = location.uri target_uri = determine_destination_for_retrieved_artifact( - destination, location.pathInStore, preserve_path + destination, location.pathInStore, preserve_path, prefix ) + # This will override any previous target URI if a source file + # has multiple refs associated with it but that is okay. + # Any prefix used will be from the final ref and all refs + # must be recorded. to_transfer[source_uri] = target_uri artifact_to_ref_id[target_uri].append(ref.id) artifact_to_info[target_uri] = info diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index c20cec5551..f6cd61c154 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -215,7 +215,7 @@ def generate_uuid5(self) -> uuid.UUID: # - uuid5 from file paths and dataset refs. # Do not attempt to include file contents in UUID. # Start with uuid5 from file paths. - data = ",".join(self.info_map.keys()) + data = ",".join(sorted(self.info_map.keys())) # No need to come up with a different namespace. return uuid.uuid5(DatasetIdFactory.NS_UUID, data) @@ -347,7 +347,7 @@ def from_zip_file(cls, zip_path: ResourcePath) -> Self: def determine_destination_for_retrieved_artifact( - destination_directory: ResourcePath, source_path: ResourcePath, preserve_path: bool + destination_directory: ResourcePath, source_path: ResourcePath, preserve_path: bool, prefix: str = "" ) -> ResourcePath: """Determine destination path for an artifact retrieved from a datastore. @@ -358,10 +358,12 @@ def determine_destination_for_retrieved_artifact( source_path : `ResourcePath` Path to the source file to be transferred. This may be relative to the datastore root, or an absolute path. 
- preserve_path : `bool`, optional + preserve_path : `bool` If `True` the full path of the artifact within the datastore is preserved. If `False` the final file component of the path is used. + prefix : `str`, optional + Prefix to add to the file name if ``preserve_path`` is `False`. Returns ------- @@ -379,6 +381,8 @@ def determine_destination_for_retrieved_artifact( target_path = target_path.relativeToPathRoot else: target_path = source_path.basename() + if prefix: + target_path = prefix + target_path target_uri = destination_directory.join(target_path).abspath() if target_uri.relative_to(destination_directory) is None: @@ -390,7 +394,7 @@ def retrieve_and_zip( refs: Iterable[DatasetRef], destination: ResourcePathExpression, retrieval_callback: Callable[ - [Iterable[DatasetRef], ResourcePath, str, bool, bool, bool], + [Iterable[DatasetRef], ResourcePath, str, bool, bool, bool, bool], tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]], ], ) -> ResourcePath: @@ -432,7 +436,7 @@ def retrieve_and_zip( with tempfile.TemporaryDirectory(dir=outdir.ospath, ignore_cleanup_errors=True) as tmpdir: tmpdir_path = ResourcePath(tmpdir, forceDirectory=True) # Retrieve the artifacts and write the index file. Strip paths. - paths, _, _ = retrieval_callback(refs, tmpdir_path, "auto", False, False, True) + paths, _, _ = retrieval_callback(refs, tmpdir_path, "auto", False, False, True, True) # Read the index to construct file name. index_path = tmpdir_path.join(ZipIndex.index_name, forceDirectory=False) diff --git a/python/lsst/daf/butler/datastores/inMemoryDatastore.py b/python/lsst/daf/butler/datastores/inMemoryDatastore.py index f66915bef6..a1513c717c 100644 --- a/python/lsst/daf/butler/datastores/inMemoryDatastore.py +++ b/python/lsst/daf/butler/datastores/inMemoryDatastore.py @@ -528,6 +528,7 @@ def retrieveArtifacts( preserve_path: bool = True, overwrite: bool | None = False, write_index: bool = True, + add_prefix: bool = False, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: """Retrieve the file artifacts associated with the supplied refs. @@ -554,6 +555,10 @@ def retrieveArtifacts( If `True` write a file at the top level called ``_index.json`` containing a serialization of a `ZipIndex` for the downloaded datasets. + add_prefix : `bool`, optional + If `True` and if ``preserve_path`` is `False`, apply a prefix to + the filenames corresponding to some part of the dataset ref ID. + This can be used to guarantee uniqueness. 
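
A concrete sketch of the prefixing described above; the dataset ID and file name are illustrative only:

    ref_id = "2ac4dd89-17ac-5354-9c15-cfec8dab5b3b"  # str(ref.id)
    prefix = ref_id[:8] + "-"
    print(prefix + "metrics.json")  # -> "2ac4dd89-metrics.json"
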
Notes ----- diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 4076b5ab2e..350df44420 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -426,6 +426,7 @@ def _retrieve_artifacts( preserve_path: bool = True, overwrite: bool = False, write_index: bool = True, + add_prefix: bool = False, ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: destination = ResourcePath(destination).abspath() if not destination.isdir(): @@ -438,12 +439,13 @@ def _retrieve_artifacts( artifact_to_info: dict[ResourcePath, StoredFileInfo] = {} output_uris: list[ResourcePath] = [] for ref in refs: + prefix = str(ref.id)[:8] + "-" if add_prefix else "" file_info = _to_file_payload(self._get_file_info_for_ref(ref)).file_info for file in file_info: source_uri = ResourcePath(str(file.url)) relative_path = ResourcePath(file.datastoreRecords.path, forceAbsolute=False) target_uri = determine_destination_for_retrieved_artifact( - destination, relative_path, preserve_path + destination, relative_path, preserve_path, prefix ) # Because signed URLs expire, we want to do the transfer soon # after retrieving the URL. diff --git a/tests/test_cliCmdRetrieveArtifacts.py b/tests/test_cliCmdRetrieveArtifacts.py index 2ff7a5b250..28690816dd 100644 --- a/tests/test_cliCmdRetrieveArtifacts.py +++ b/tests/test_cliCmdRetrieveArtifacts.py @@ -113,7 +113,7 @@ def testRetrieveAsZip(self): ], ) self.assertEqual(result.exit_code, 0, clickResultMsg(result)) - self.assertIn("2ac4dd89-17ac-5354-9c15-cfec8dab5b3b.zip", result.stdout) + self.assertIn(".zip", result.stdout) artifacts = self.find_files(destdir) self.assertEqual(len(artifacts), 1, f"Expected one zip file: {artifacts}") diff --git a/tests/test_remote_butler.py b/tests/test_remote_butler.py index e1ed3522c2..891d40b868 100644 --- a/tests/test_remote_butler.py +++ b/tests/test_remote_butler.py @@ -138,6 +138,17 @@ def test_retrieve_artifacts_security(self): ResourcePath("/tmp/output_directory/not/relative.txt"), ) + # Test prefixing. + self.assertEqual( + determine_destination_for_retrieved_artifact( + ResourcePath("/tmp/output_directory/"), + ResourcePath("file:///not/relative.txt"), + preserve_path=False, + prefix="prefix-", + ), + ResourcePath("/tmp/output_directory/prefix-relative.txt"), + ) + class RemoteButlerRegistryTests(RegistryTests): """Tests for RemoteButler's `Registry` shim.""" From 05bc923ffbc98a118c22cd4f852e3491147d6ffc Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Wed, 23 Oct 2024 14:47:59 -0700 Subject: [PATCH 17/39] Attempt to prevent repeated copying of the same file for DECam/Zips This does not fix the problem of retrieve-artifacts retrieving Zip files and ending up with a bad index. --- .../daf/butler/datastores/fileDatastore.py | 8 ++++-- .../butler/remote_butler/_remote_butler.py | 27 ++++++++++++------- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index cfcd4cc40a..b374a8b2be 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -2134,9 +2134,13 @@ def retrieveArtifacts( # This will override any previous target URI if a source file # has multiple refs associated with it but that is okay. 
# Any prefix used will be from the final ref and all refs - # must be recorded. - to_transfer[source_uri] = target_uri + # must be recorded. Fragments have to be removed from the + # transfer list though to prevent duplicate copies. + cleaned_source_uri = source_uri.replace(fragment="", query="", params="") + to_transfer[cleaned_source_uri] = target_uri artifact_to_ref_id[target_uri].append(ref.id) + # TODO: If this is a Zip file, it should be unzipped and the + # index merged. Else only a single info record is recorded. artifact_to_info[target_uri] = info # In theory can now parallelize the transfer diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 350df44420..4946308f82 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -438,21 +438,30 @@ def _retrieve_artifacts( artifact_to_ref_id: dict[ResourcePath, list[DatasetId]] = defaultdict(list) artifact_to_info: dict[ResourcePath, StoredFileInfo] = {} output_uris: list[ResourcePath] = [] + have_copied: dict[ResourcePath, ResourcePath] = {} for ref in refs: prefix = str(ref.id)[:8] + "-" if add_prefix else "" file_info = _to_file_payload(self._get_file_info_for_ref(ref)).file_info for file in file_info: source_uri = ResourcePath(str(file.url)) - relative_path = ResourcePath(file.datastoreRecords.path, forceAbsolute=False) - target_uri = determine_destination_for_retrieved_artifact( - destination, relative_path, preserve_path, prefix - ) - # Because signed URLs expire, we want to do the transfer soon - # after retrieving the URL. - target_uri.transfer_from(source_uri, transfer="copy", overwrite=overwrite) - output_uris.append(target_uri) - artifact_to_ref_id[target_uri].append(ref.id) + # For decam/zip situation we only want to copy once. + cleaned_source_uri = source_uri.replace(fragment="", query="", params="") + if cleaned_source_uri not in have_copied: + relative_path = ResourcePath(file.datastoreRecords.path, forceAbsolute=False) + target_uri = determine_destination_for_retrieved_artifact( + destination, relative_path, preserve_path, prefix + ) + # Because signed URLs expire, we want to do the transfer + # son after retrieving the URL. + target_uri.transfer_from(source_uri, transfer="copy", overwrite=overwrite) + output_uris.append(target_uri) + have_copied[cleaned_source_uri] = target_uri + else: + target_uri = have_copied[cleaned_source_uri] + # TODO: Fix this with Zip files that need to be unzipped + # on retrieval and index merged. 
artifact_to_info[target_uri] = StoredFileInfo.from_simple(file.datastoreRecords) + artifact_to_ref_id[target_uri].append(ref.id) if write_index: index = ZipIndex.from_artifact_maps(refs, artifact_to_ref_id, artifact_to_info, destination) From 9ef21fe10253d3cfac001e48ec18a3ad37a6a4b0 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 25 Oct 2024 14:20:24 -0700 Subject: [PATCH 18/39] Minor doc string fixes Co-authored-by: Andy Salnikov --- python/lsst/daf/butler/_butler.py | 2 +- python/lsst/daf/butler/_quantum_backed.py | 2 +- .../daf/butler/datastores/file_datastore/retrieve_artifacts.py | 2 +- python/lsst/daf/butler/remote_butler/_remote_butler.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index 19341e9ed2..edd7bbe0e0 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -997,7 +997,7 @@ def retrieve_artifacts_zip( Parameters ---------- - refs : `collections.abc.Iterable` [ `DatasetRef` ] + refs : `~collections.abc.Iterable` [ `DatasetRef` ] The datasets to be included in the zip file. Must all be from the same dataset type. destination : `lsst.resources.ResourcePathExpression` diff --git a/python/lsst/daf/butler/_quantum_backed.py b/python/lsst/daf/butler/_quantum_backed.py index 9176b752d1..968cc6f84a 100644 --- a/python/lsst/daf/butler/_quantum_backed.py +++ b/python/lsst/daf/butler/_quantum_backed.py @@ -508,7 +508,7 @@ def retrieve_artifacts_zip( Parameters ---------- - refs : `collections.abc.Iterable` [ `DatasetRef` ] + refs : `~collections.abc.Iterable` [ `DatasetRef` ] The datasets to be included in the zip file. Must all be from the same dataset type. destination : `lsst.resources.ResourcePathExpression` diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index f6cd61c154..9805d1c47b 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -146,7 +146,7 @@ def to_refs(self, universe: DimensionUniverse) -> list[DatasetRef]: if universe.namespace != self.universe_namespace: raise RuntimeError( - f"Can not convert to refs in universe {universe.namespace} that were created frp, " + f"Can not convert to refs in universe {universe.namespace} that were created from " f"universe {self.universe_namespace}" ) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 4946308f82..2fef25a7bd 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -452,7 +452,7 @@ def _retrieve_artifacts( destination, relative_path, preserve_path, prefix ) # Because signed URLs expire, we want to do the transfer - # son after retrieving the URL. + # soon after retrieving the URL. 
target_uri.transfer_from(source_uri, transfer="copy", overwrite=overwrite) output_uris.append(target_uri) have_copied[cleaned_source_uri] = target_uri From 24c470122a93b0900418c5515b95a411ed285bd9 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 25 Oct 2024 14:58:02 -0700 Subject: [PATCH 19/39] Minor tweaks from review --- python/lsst/daf/butler/_butler.py | 8 ++++++-- python/lsst/daf/butler/_quantum_backed.py | 8 ++++++-- python/lsst/daf/butler/cli/cmd/commands.py | 8 +++++--- python/lsst/daf/butler/datastore/_datastore.py | 2 +- .../datastores/file_datastore/retrieve_artifacts.py | 8 ++++++++ tests/test_butler.py | 3 +++ 6 files changed, 29 insertions(+), 8 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index edd7bbe0e0..c45fec258a 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -998,8 +998,7 @@ def retrieve_artifacts_zip( Parameters ---------- refs : `~collections.abc.Iterable` [ `DatasetRef` ] - The datasets to be included in the zip file. Must all be from - the same dataset type. + The datasets to be included in the zip file. destination : `lsst.resources.ResourcePathExpression` Directory to write the new ZIP file. This directory will also be used as a staging area for the datasets being downloaded @@ -1009,6 +1008,11 @@ def retrieve_artifacts_zip( ------- zip_file : `lsst.resources.ResourcePath` The path to the new ZIP file. + + Raises + ------ + ValueError + Raised if there are no refs to retrieve. """ raise NotImplementedError() diff --git a/python/lsst/daf/butler/_quantum_backed.py b/python/lsst/daf/butler/_quantum_backed.py index 968cc6f84a..731e5a6183 100644 --- a/python/lsst/daf/butler/_quantum_backed.py +++ b/python/lsst/daf/butler/_quantum_backed.py @@ -509,8 +509,7 @@ def retrieve_artifacts_zip( Parameters ---------- refs : `~collections.abc.Iterable` [ `DatasetRef` ] - The datasets to be included in the zip file. Must all be from - the same dataset type. + The datasets to be included in the zip file. destination : `lsst.resources.ResourcePathExpression` Directory to write the new ZIP file. This directory will also be used as a staging area for the datasets being downloaded @@ -520,6 +519,11 @@ def retrieve_artifacts_zip( ------- zip_file : `lsst.resources.ResourcePath` The path to the new ZIP file. + + Raises + ------ + ValueError + Raised if there are no refs to retrieve. 
""" return retrieve_and_zip(refs, destination, self._datastore.retrieveArtifacts) diff --git a/python/lsst/daf/butler/cli/cmd/commands.py b/python/lsst/daf/butler/cli/cmd/commands.py index 804dfcbaa1..8b55614115 100644 --- a/python/lsst/daf/butler/cli/cmd/commands.py +++ b/python/lsst/daf/butler/cli/cmd/commands.py @@ -613,10 +613,12 @@ def retrieve_artifacts(**kwargs: Any) -> None: """Retrieve file artifacts associated with datasets in a repository.""" verbose = kwargs.pop("verbose") transferred = script.retrieveArtifacts(**kwargs) - if kwargs["zip"] and transferred: - print(f"Zip files written to {transferred[0]}") + if not transferred: + print("No datasets matched query.") + elif kwargs["zip"]: + print(f"Zip file written to {transferred[0]}") else: - if verbose and transferred: + if verbose: print(f"Transferred the following to {kwargs['destination']}:") for uri in transferred: print(uri) diff --git a/python/lsst/daf/butler/datastore/_datastore.py b/python/lsst/daf/butler/datastore/_datastore.py index 0285009e62..27e3ee75cd 100644 --- a/python/lsst/daf/butler/datastore/_datastore.py +++ b/python/lsst/daf/butler/datastore/_datastore.py @@ -1066,7 +1066,7 @@ def retrieveArtifacts( `list` [ `uuid.UUID` ] ] Mapping of retrieved artifact path to DatasetRef ID. artifacts_to_info : `dict` [ `~lsst.resources.ResourcePath`, \ - `StoredDatastoreItemInfo` ] + `StoredFileInfo` ] Mapping of retrieved artifact path to datastore record information. Notes diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 9805d1c47b..91d801a957 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -418,7 +418,15 @@ def retrieve_and_zip( ------- zip_file : `lsst.resources.ResourcePath` The path to the new ZIP file. + + Raises + ------ + ValueError + Raised if there are no refs to retrieve. """ + if not refs: + raise ValueError("Requested Zip file with no contents.") + outdir = ResourcePath(destination, forceDirectory=True) if not outdir.isdir(): raise ValueError(f"Destination location must refer to a directory. Given {destination}") diff --git a/tests/test_butler.py b/tests/test_butler.py index c654996509..f498f31f1c 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -967,6 +967,9 @@ def test_ingest_zip(self) -> None: self.assertFalse(uri.exists()) self.assertFalse(butler.exists(refs[-1])) + with self.assertRaises(ValueError): + butler.retrieve_artifacts_zip([], destination=".") + def testIngest(self) -> None: butler = self.create_empty_butler(run=self.default_run) From d06f36a2d32ab8b6eee92f698cdf38d7eddfdf42 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 25 Oct 2024 15:11:00 -0700 Subject: [PATCH 20/39] Rename Zip index file name And remove some duplicated code that was writing the index a second time. 
--- python/lsst/daf/butler/datastore/_datastore.py | 5 ++--- python/lsst/daf/butler/datastores/chainedDatastore.py | 5 ++--- python/lsst/daf/butler/datastores/fileDatastore.py | 5 ++--- .../datastores/file_datastore/retrieve_artifacts.py | 2 +- python/lsst/daf/butler/datastores/inMemoryDatastore.py | 5 ++--- python/lsst/daf/butler/direct_butler/_direct_butler.py | 10 ++-------- tests/test_butler.py | 3 ++- 7 files changed, 13 insertions(+), 22 deletions(-) diff --git a/python/lsst/daf/butler/datastore/_datastore.py b/python/lsst/daf/butler/datastore/_datastore.py index 27e3ee75cd..fcdfabc3d5 100644 --- a/python/lsst/daf/butler/datastore/_datastore.py +++ b/python/lsst/daf/butler/datastore/_datastore.py @@ -1049,9 +1049,8 @@ def retrieveArtifacts( If `True` allow transfers to overwrite existing files at the destination. write_index : `bool`, optional - If `True` write a file at the top level called ``_index.json`` - containing a serialization of a `ZipIndex` for the downloaded - datasets. + If `True` write a file at the top level containing a serialization + of a `ZipIndex` for the downloaded datasets. add_prefix : `bool`, optional If `True` and if ``preserve_path`` is `False`, apply a prefix to the filenames corresponding to some part of the dataset ref ID. diff --git a/python/lsst/daf/butler/datastores/chainedDatastore.py b/python/lsst/daf/butler/datastores/chainedDatastore.py index e96be704c8..5e34dc862c 100644 --- a/python/lsst/daf/butler/datastores/chainedDatastore.py +++ b/python/lsst/daf/butler/datastores/chainedDatastore.py @@ -842,9 +842,8 @@ def retrieveArtifacts( If `True` allow transfers to overwrite existing files at the destination. write_index : `bool`, optional - If `True` write a file at the top level called ``_index.json`` - containing a serialization of a `ZipIndex` for the downloaded - datasets. + If `True` write a file at the top level containing a serialization + of a `ZipIndex` for the downloaded datasets. add_prefix : `bool`, optional Add a prefix based on the DatasetId. Only used if ``preserve_path`` is `False`. diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index b374a8b2be..67a1a58381 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -2073,9 +2073,8 @@ def retrieveArtifacts( If `True` allow transfers to overwrite existing files at the destination. write_index : `bool`, optional - If `True` write a file at the top level called ``_index.json`` - containing a serialization of a `ZipIndex` for the downloaded - datasets. + If `True` write a file at the top level containing a serialization + of a `ZipIndex` for the downloaded datasets. add_prefix : `bool`, optional If `True` and if ``preserve_path`` is `False`, apply a prefix to the filenames corresponding to some part of the dataset ref ID. 
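A minimal sketch (not part of the patch) of the prefixing behaviour described in
the add_prefix docstrings above: when preserve_path is False the prefix is derived
from the first eight characters of the dataset UUID, so artifacts that share a
basename cannot collide in the flat destination directory:

    # Sketch only; mirrors the str(ref.id)[:8] + "-" logic added in this series.
    import uuid

    dataset_id = uuid.uuid4()  # stand-in for DatasetRef.id
    prefix = str(dataset_id)[:8] + "-"
    print(prefix + "relative.txt")  # e.g. "2ac4dd89-relative.txt"
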
diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 91d801a957..de6cc4fe65 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -195,7 +195,7 @@ class ZipIndex(BaseModel): info_map: dict[str, SerializedStoredFileInfo] """Mapping of each Zip member to the associated datastore record.""" - index_name: ClassVar[str] = "_index.json" + index_name: ClassVar[str] = "_butler_zip_index.json" """Name to use when saving the index to a file.""" def generate_uuid5(self) -> uuid.UUID: diff --git a/python/lsst/daf/butler/datastores/inMemoryDatastore.py b/python/lsst/daf/butler/datastores/inMemoryDatastore.py index a1513c717c..7cb4717a13 100644 --- a/python/lsst/daf/butler/datastores/inMemoryDatastore.py +++ b/python/lsst/daf/butler/datastores/inMemoryDatastore.py @@ -552,9 +552,8 @@ def retrieveArtifacts( If `True` allow transfers to overwrite existing files at the destination. write_index : `bool`, optional - If `True` write a file at the top level called ``_index.json`` - containing a serialization of a `ZipIndex` for the downloaded - datasets. + If `True` write a file at the top level containing a serialization + of a `ZipIndex` for the downloaded datasets. add_prefix : `bool`, optional If `True` and if ``preserve_path`` is `False`, apply a prefix to the filenames corresponding to some part of the dataset ref ID. diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index 88715803a2..946fd25e10 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -1309,20 +1309,14 @@ def retrieveArtifacts( ) -> list[ResourcePath]: # Docstring inherited. outdir = ResourcePath(destination) - paths, id_map, info_map = self._datastore.retrieveArtifacts( + paths, _, _ = self._datastore.retrieveArtifacts( refs, outdir, transfer=transfer, preserve_path=preserve_path, overwrite=overwrite, + write_index=True, ) - # Write the index file. - index = ZipIndex.from_artifact_maps( - refs, id_map, info_map, ResourcePath(destination, forceDirectory=True) - ) - index_path = outdir.join("_index.json") - with index_path.open("w") as fd: - print(index.model_dump_json(exclude_defaults=True, exclude_unset=True), file=fd) return paths def exists( diff --git a/tests/test_butler.py b/tests/test_butler.py index f498f31f1c..9f24d6ee85 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -88,6 +88,7 @@ def mock_aws(*args: Any, **kwargs: Any) -> Any: # type: ignore[no-untyped-def] ) from lsst.daf.butler.datastore import NullDatastore from lsst.daf.butler.datastore.file_templates import FileTemplate, FileTemplateValidationError +from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex from lsst.daf.butler.datastores.fileDatastore import FileDatastore from lsst.daf.butler.direct_butler import DirectButler from lsst.daf.butler.registry import ( @@ -362,7 +363,7 @@ def runPutGetTest(self, storageClass: StorageClass, datasetTypeName: str) -> But self.assertGreater(len(transferred), 0) artifacts = list(ResourcePath.findFileResources([destination])) # Filter out the index file. 
- artifacts = [a for a in artifacts if a.basename() != "_index.json"] + artifacts = [a for a in artifacts if a.basename() != ZipIndex.index_name] self.assertEqual(set(transferred), set(artifacts)) for artifact in transferred: From 372d54c63e7c5c2aad0d3db23619e4b88cac499d Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 25 Oct 2024 15:22:03 -0700 Subject: [PATCH 21/39] Use partition rather than split --- python/lsst/daf/butler/_formatter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/_formatter.py b/python/lsst/daf/butler/_formatter.py index 96b4518605..e0378aa301 100644 --- a/python/lsst/daf/butler/_formatter.py +++ b/python/lsst/daf/butler/_formatter.py @@ -440,7 +440,7 @@ def read( # be extracted. uri = self.file_descriptor.location.uri if uri.fragment and uri.unquoted_fragment.startswith("zip-path="): - _, path_in_zip = uri.unquoted_fragment.split("=") + _, _, path_in_zip = uri.unquoted_fragment.partition("=") # Open the Zip file using ResourcePath. with uri.open("rb") as fd: From 16afb6120a39dc8ac224be43ca53fb5a355ff6d4 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Fri, 25 Oct 2024 16:06:01 -0700 Subject: [PATCH 22/39] Fix file datastore retrieve artifacts index for DECam data --- .../daf/butler/datastores/fileDatastore.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index 67a1a58381..b82556f27a 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -2122,21 +2122,23 @@ def retrieveArtifacts( # e.g. DECam. artifact_to_ref_id: dict[ResourcePath, list[DatasetId]] = defaultdict(list) artifact_to_info: dict[ResourcePath, StoredFileInfo] = {} + have_copied: dict[ResourcePath, ResourcePath] = {} for ref in refs: prefix = str(ref.id)[:8] + "-" if add_prefix else "" for info in records[ref.id]: location = info.file_location(self.locationFactory) source_uri = location.uri - target_uri = determine_destination_for_retrieved_artifact( - destination, location.pathInStore, preserve_path, prefix - ) - # This will override any previous target URI if a source file - # has multiple refs associated with it but that is okay. - # Any prefix used will be from the final ref and all refs - # must be recorded. Fragments have to be removed from the - # transfer list though to prevent duplicate copies. + # For DECam/zip we only want to copy once. + # We need to remove fragments for consistency. cleaned_source_uri = source_uri.replace(fragment="", query="", params="") - to_transfer[cleaned_source_uri] = target_uri + if cleaned_source_uri not in have_copied: + target_uri = determine_destination_for_retrieved_artifact( + destination, location.pathInStore, preserve_path, prefix + ) + to_transfer[cleaned_source_uri] = target_uri + have_copied[cleaned_source_uri] = target_uri + else: + target_uri = have_copied[cleaned_source_uri] artifact_to_ref_id[target_uri].append(ref.id) # TODO: If this is a Zip file, it should be unzipped and the # index merged. Else only a single info record is recorded. From 81412486dd94683045bc1833b03c78804e44311b Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Sat, 26 Oct 2024 09:56:43 -0700 Subject: [PATCH 23/39] Provide defaults for checksum and component This allows pydantic to drop them from the JSON. 
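A minimal sketch (not part of the patch, assuming pydantic v2) of why the defaults
matter here: once component and checksum default to None, serializing with
exclude_defaults=True, as the index-writing code does, omits them from the JSON:

    # Sketch only; SimplifiedInfo is a stand-in for SerializedStoredFileInfo.
    import pydantic


    class SimplifiedInfo(pydantic.BaseModel):
        path: str
        component: str | None = None  # default added by this patch
        checksum: str | None = None   # default added by this patch


    info = SimplifiedInfo(path="a/b.fits")
    print(info.model_dump_json(exclude_defaults=True))  # {"path":"a/b.fits"}
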
--- python/lsst/daf/butler/datastore/stored_file_info.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/lsst/daf/butler/datastore/stored_file_info.py b/python/lsst/daf/butler/datastore/stored_file_info.py index 5dbc3f6711..513d81e926 100644 --- a/python/lsst/daf/butler/datastore/stored_file_info.py +++ b/python/lsst/daf/butler/datastore/stored_file_info.py @@ -385,11 +385,11 @@ class SerializedStoredFileInfo(pydantic.BaseModel): storage_class: str """Name of the StorageClass associated with Dataset.""" - component: str | None + component: str | None = None """Component associated with this file. Can be `None` if the file does not refer to a component of a composite.""" - checksum: str | None + checksum: str | None = None """Checksum of the serialized dataset.""" file_size: int From 0e8af263c56a65e8bbfdfb9d7296b28ce26636be Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Sat, 26 Oct 2024 10:10:42 -0700 Subject: [PATCH 24/39] Reorganize index to only have one dict for path in zip --- .../lsst/daf/butler/datastore/_datastore.py | 17 ++-- .../daf/butler/datastores/chainedDatastore.py | 36 +++----- .../daf/butler/datastores/fileDatastore.py | 40 ++++----- .../file_datastore/retrieve_artifacts.py | 84 +++++++++++-------- .../butler/datastores/inMemoryDatastore.py | 4 +- .../butler/direct_butler/_direct_butler.py | 8 +- .../butler/remote_butler/_remote_butler.py | 21 ++--- tests/test_quantumBackedButler.py | 2 +- 8 files changed, 102 insertions(+), 110 deletions(-) diff --git a/python/lsst/daf/butler/datastore/_datastore.py b/python/lsst/daf/butler/datastore/_datastore.py index fcdfabc3d5..d3de5fedda 100644 --- a/python/lsst/daf/butler/datastore/_datastore.py +++ b/python/lsst/daf/butler/datastore/_datastore.py @@ -55,16 +55,16 @@ from .._file_dataset import FileDataset from .._storage_class import StorageClassFactory from .constraints import Constraints -from .stored_file_info import StoredFileInfo if TYPE_CHECKING: from lsst.resources import ResourcePath, ResourcePathExpression from .. import ddl from .._config_support import LookupKey - from .._dataset_ref import DatasetId, DatasetRef + from .._dataset_ref import DatasetRef from .._dataset_type import DatasetType from .._storage_class import StorageClass + from ..datastores.file_datastore.retrieve_artifacts import ArtifactIndexInfo from ..registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager from .record_data import DatastoreRecordData from .stored_file_info import StoredDatastoreItemInfo @@ -1026,7 +1026,7 @@ def retrieveArtifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: """Retrieve the artifacts associated with the supplied refs. Parameters @@ -1061,12 +1061,9 @@ def retrieveArtifacts( targets : `list` of `lsst.resources.ResourcePath` URIs of file artifacts in destination location. Order is not preserved. - artifacts_to_ref_id : `dict` [ `~lsst.resources.ResourcePath`, \ - `list` [ `uuid.UUID` ] ] - Mapping of retrieved artifact path to DatasetRef ID. - artifacts_to_info : `dict` [ `~lsst.resources.ResourcePath`, \ - `StoredFileInfo` ] - Mapping of retrieved artifact path to datastore record information. + artifact_map : `dict` [ `lsst.resources.ResourcePath`, \ + `ArtifactIndexInfo` ] + Mapping of retrieved file to associated index information. 
Notes ----- @@ -1494,7 +1491,7 @@ def retrieveArtifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: raise NotImplementedError("This is a no-op datastore that can not access a real datastore") def remove(self, datasetRef: DatasetRef) -> None: diff --git a/python/lsst/daf/butler/datastores/chainedDatastore.py b/python/lsst/daf/butler/datastores/chainedDatastore.py index 5e34dc862c..1ad8b637a9 100644 --- a/python/lsst/daf/butler/datastores/chainedDatastore.py +++ b/python/lsst/daf/butler/datastores/chainedDatastore.py @@ -38,13 +38,7 @@ from collections.abc import Callable, Collection, Iterable, Mapping, Sequence from typing import TYPE_CHECKING, Any -from lsst.daf.butler import ( - DatasetId, - DatasetRef, - DatasetTypeNotSupportedError, - DimensionUniverse, - FileDataset, -) +from lsst.daf.butler import DatasetRef, DatasetTypeNotSupportedError, DimensionUniverse, FileDataset from lsst.daf.butler.datastore import ( DatasetRefURIs, Datastore, @@ -54,8 +48,7 @@ ) from lsst.daf.butler.datastore.constraints import Constraints from lsst.daf.butler.datastore.record_data import DatastoreRecordData -from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo -from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex +from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ArtifactIndexInfo, ZipIndex from lsst.resources import ResourcePath from lsst.utils import doImportType @@ -819,7 +812,7 @@ def retrieveArtifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: """Retrieve the file artifacts associated with the supplied refs. Parameters @@ -853,12 +846,9 @@ def retrieveArtifacts( targets : `list` of `lsst.resources.ResourcePath` URIs of file artifacts in destination location. Order is not preserved. - artifacts_to_ref_id : `dict` [ `~lsst.resources.ResourcePath`, \ - `list` [ `uuid.UUID` ] ] - Mapping of retrieved artifact path to `DatasetRef` ID. - artifacts_to_info : `dict` [ `~lsst.resources.ResourcePath`, \ - `StoredDatastoreItemInfo` ] - Mapping of retrieved artifact path to datastore record information. + artifact_map : `dict` [ `lsst.resources.ResourcePath`, \ + `ArtifactIndexInfo` ] + Mapping of retrieved file to associated index information. """ if not destination.isdir(): raise ValueError(f"Destination location must refer to a directory. Given {destination}") @@ -904,10 +894,9 @@ def retrieveArtifacts( # Now do the transfer. 
targets: list[ResourcePath] = [] - merged_artifacts_to_ref_id: dict[ResourcePath, list[DatasetId]] = {} - merged_artifacts_to_info: dict[ResourcePath, StoredFileInfo] = {} + merged_artifact_map: dict[ResourcePath, ArtifactIndexInfo] = {} for number, datastore_refs in grouped_by_datastore.items(): - retrieved, artifacts_to_ref_id, artifacts_to_info = self.datastores[number].retrieveArtifacts( + retrieved, artifact_map = self.datastores[number].retrieveArtifacts( datastore_refs, destination, transfer=transfer, @@ -917,16 +906,13 @@ def retrieveArtifacts( add_prefix=add_prefix, ) targets.extend(retrieved) - merged_artifacts_to_ref_id.update(artifacts_to_ref_id) - merged_artifacts_to_info.update(artifacts_to_info) + merged_artifact_map.update(artifact_map) if write_index: - index = ZipIndex.from_artifact_maps( - refs, merged_artifacts_to_ref_id, merged_artifacts_to_info, destination - ) + index = ZipIndex.from_artifact_map(refs, merged_artifact_map, destination) index.write_index(destination) - return targets, merged_artifacts_to_ref_id, merged_artifacts_to_info + return targets, merged_artifact_map def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: """Ingest an indexed Zip file and contents. diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index b82556f27a..3dd3173879 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -81,6 +81,7 @@ get_dataset_as_python_object_from_get_info, ) from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ( + ArtifactIndexInfo, ZipIndex, determine_destination_for_retrieved_artifact, ) @@ -2050,7 +2051,7 @@ def retrieveArtifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: """Retrieve the file artifacts associated with the supplied refs. Parameters @@ -2085,12 +2086,9 @@ def retrieveArtifacts( targets : `list` of `lsst.resources.ResourcePath` URIs of file artifacts in destination location. Order is not preserved. - artifacts_to_ref_id : `dict` [ `~lsst.resources.ResourcePath`, \ - `list` [ `uuid.UUID` ] ] - Mapping of retrieved artifact path to `DatasetRef` ID. - artifacts_to_info : `dict` [ `~lsst.resources.ResourcePath`, \ - `StoredDatastoreItemInfo` ] - Mapping of retrieved artifact path to datastore record information. + artifact_map : `dict` [ `lsst.resources.ResourcePath`, \ + `ArtifactIndexInfo` ] + Mapping of retrieved file to associated index information. """ if not destination.isdir(): raise ValueError(f"Destination location must refer to a directory. Given {destination}") @@ -2120,9 +2118,7 @@ def retrieveArtifacts( # One artifact can be used by multiple DatasetRef. # e.g. DECam. - artifact_to_ref_id: dict[ResourcePath, list[DatasetId]] = defaultdict(list) - artifact_to_info: dict[ResourcePath, StoredFileInfo] = {} - have_copied: dict[ResourcePath, ResourcePath] = {} + artifact_map: dict[ResourcePath, ArtifactIndexInfo] = {} for ref in refs: prefix = str(ref.id)[:8] + "-" if add_prefix else "" for info in records[ref.id]: @@ -2130,19 +2126,17 @@ def retrieveArtifacts( source_uri = location.uri # For DECam/zip we only want to copy once. # We need to remove fragments for consistency. + # TODO: Unzip zip files on retrieval and merge indexes. 
cleaned_source_uri = source_uri.replace(fragment="", query="", params="") - if cleaned_source_uri not in have_copied: + if cleaned_source_uri not in to_transfer: target_uri = determine_destination_for_retrieved_artifact( destination, location.pathInStore, preserve_path, prefix ) to_transfer[cleaned_source_uri] = target_uri - have_copied[cleaned_source_uri] = target_uri + artifact_map[target_uri] = ArtifactIndexInfo.from_single(info.to_simple(), ref.id) else: - target_uri = have_copied[cleaned_source_uri] - artifact_to_ref_id[target_uri].append(ref.id) - # TODO: If this is a Zip file, it should be unzipped and the - # index merged. Else only a single info record is recorded. - artifact_to_info[target_uri] = info + target_uri = to_transfer[cleaned_source_uri] + artifact_map[target_uri].append(ref.id) # In theory can now parallelize the transfer log.debug("Number of artifacts to transfer to %s: %d", str(destination), len(to_transfer)) @@ -2150,10 +2144,10 @@ def retrieveArtifacts( target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) if write_index: - index = ZipIndex.from_artifact_maps(refs, artifact_to_ref_id, artifact_to_info, destination) + index = ZipIndex.from_artifact_map(refs, artifact_map, destination) index.write_index(destination) - return list(to_transfer.values()), artifact_to_ref_id, artifact_to_info + return list(to_transfer.values()), artifact_map def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: """Ingest an indexed Zip file and contents. @@ -2229,13 +2223,13 @@ def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: # Associate each file with a (DatasetRef, StoredFileInfo) tuple. artifacts: list[tuple[DatasetRef, StoredFileInfo]] = [] - for path_in_zip, serialized_info in index.info_map.items(): + for path_in_zip, index_info in index.artifact_map.items(): # Need to modify the info to include the path to the Zip file # that was previously written to the datastore. 
- serialized_info.path = f"{path_in_store}#zip-path={path_in_zip}" + index_info.info.path = f"{path_in_store}#zip-path={path_in_zip}" - info = StoredFileInfo.from_simple(serialized_info) - for id_ in index.ref_map[path_in_zip]: + info = StoredFileInfo.from_simple(index_info.info) + for id_ in index_info.ids: artifacts.append((id_to_ref[id_], info)) self._register_datasets(artifacts, insert_mode=DatabaseInsertMode.INSERT) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index de6cc4fe65..85752d870f 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -36,8 +36,8 @@ from collections.abc import Callable, Iterable from typing import ClassVar, Self -from lsst.daf.butler import DatasetId, DatasetIdFactory, DatasetRef -from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo, StoredFileInfo +from lsst.daf.butler import DatasetIdFactory, DatasetRef +from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo from lsst.resources import ResourcePath, ResourcePathExpression from pydantic import BaseModel @@ -177,6 +177,39 @@ def to_refs(self, universe: DimensionUniverse) -> list[DatasetRef]: return refs +class ArtifactIndexInfo(BaseModel): + """Information related to an artifact in an index.""" + + info: SerializedStoredFileInfo + """Datastore record information for this file artifact.""" + + ids: list[uuid.UUID] + """Dataset IDs for this artifact.""" + + def append(self, id_: uuid.UUID) -> None: + """Add an additional dataset ID. + + Parameters + ---------- + id_ : `uuid.UUID` + Additional dataset ID to associate with this artifact. + """ + self.ids.append(id_) + + @classmethod + def from_single(cls, info: SerializedStoredFileInfo, id_: uuid.UUID) -> Self: + """Create a mapping from a single ID and info. + + Parameters + ---------- + info : `SerializedStoredFileInfo` + Datastore record for this artifact. + id_ : `uuid.UUID` + First dataset ID to associate with this artifact. + """ + return cls(info=info, ids=[id_]) + + class ZipIndex(BaseModel): """Index of a Zip file of Butler datasets. @@ -189,11 +222,8 @@ class ZipIndex(BaseModel): refs: SerializedDatasetRefContainer """Deduplicated information for all the `DatasetRef` in the index.""" - ref_map: dict[str, list[uuid.UUID]] - """Mapping of Zip member to one or more dataset UUIDs.""" - - info_map: dict[str, SerializedStoredFileInfo] - """Mapping of each Zip member to the associated datastore record.""" + artifact_map: dict[str, ArtifactIndexInfo] + """Mapping of each Zip member to associated lookup information.""" index_name: ClassVar[str] = "_butler_zip_index.json" """Name to use when saving the index to a file.""" @@ -215,13 +245,13 @@ def generate_uuid5(self) -> uuid.UUID: # - uuid5 from file paths and dataset refs. # Do not attempt to include file contents in UUID. # Start with uuid5 from file paths. - data = ",".join(sorted(self.info_map.keys())) + data = ",".join(sorted(self.artifact_map.keys())) # No need to come up with a different namespace. 
return uuid.uuid5(DatasetIdFactory.NS_UUID, data) def __len__(self) -> int: """Return the number of files in the Zip.""" - return len(self.info_map) + return len(self.artifact_map) def calculate_zip_file_name(self) -> str: """Calculate the default name for the Zip file based on the index @@ -280,11 +310,10 @@ def calc_relative_paths( return file_to_relative @classmethod - def from_artifact_maps( + def from_artifact_map( cls, refs: Iterable[DatasetRef], - id_map: dict[ResourcePath, list[DatasetId]], - info_map: dict[ResourcePath, StoredFileInfo], + artifact_map: dict[ResourcePath, ArtifactIndexInfo], root: ResourcePath, ) -> Self: """Create an index from the mappings returned from @@ -294,42 +323,31 @@ def from_artifact_maps( ---------- refs : `~collections.abc.Iterable` [ `DatasetRef` ] Datasets present in the index. - id_map : `dict` [ `lsst.resources.ResourcePath`, \ - `list` [ `uuid.UUID`] ] - Mapping of retrieved artifact path to `DatasetRef` ID. - info_map : `dict` [ `~lsst.resources.ResourcePath`, \ - `StoredDatastoreItemInfo` ] - Mapping of retrieved artifact path to datastore record information. + artifact_map : `dict` [ `lsst.resources.ResourcePath`, `ArtifactMap` ] + Mapping of artifact path to information linking it to the + associated refs and datastore information. root : `lsst.resources.ResourcePath` Root path to be removed from all the paths before creating the index. """ if not refs: return cls( - universe_version=0, - universe_namespace="daf_butler", refs=SerializedDatasetRefContainer.from_refs(refs), - ref_map={}, - info_map={}, + artifact_map={}, ) # Calculate the paths relative to the given root since the Zip file # uses relative paths. - file_to_relative = cls.calc_relative_paths(root, info_map.keys()) + file_to_relative = cls.calc_relative_paths(root, artifact_map.keys()) simplified_refs = SerializedDatasetRefContainer.from_refs(refs) - # Convert the mappings to simplified form for pydantic. - # and use the relative paths that will match the zip. - simplified_ref_map = {} - for path, ids in id_map.items(): - simplified_ref_map[file_to_relative[path]] = ids - simplified_info_map = {file_to_relative[path]: info.to_simple() for path, info in info_map.items()} + # Convert the artifact mapping to relative path. + relative_artifact_map = {file_to_relative[path]: info for path, info in artifact_map.items()} return cls( refs=simplified_refs, - ref_map=simplified_ref_map, - info_map=simplified_info_map, + artifact_map=relative_artifact_map, ) @classmethod @@ -395,7 +413,7 @@ def retrieve_and_zip( destination: ResourcePathExpression, retrieval_callback: Callable[ [Iterable[DatasetRef], ResourcePath, str, bool, bool, bool, bool], - tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]], + tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]], ], ) -> ResourcePath: """Retrieve artifacts from a Butler and place in ZIP file. @@ -444,7 +462,7 @@ def retrieve_and_zip( with tempfile.TemporaryDirectory(dir=outdir.ospath, ignore_cleanup_errors=True) as tmpdir: tmpdir_path = ResourcePath(tmpdir, forceDirectory=True) # Retrieve the artifacts and write the index file. Strip paths. - paths, _, _ = retrieval_callback(refs, tmpdir_path, "auto", False, False, True, True) + paths, _ = retrieval_callback(refs, tmpdir_path, "auto", False, False, True, True) # Read the index to construct file name. 
index_path = tmpdir_path.join(ZipIndex.index_name, forceDirectory=False) diff --git a/python/lsst/daf/butler/datastores/inMemoryDatastore.py b/python/lsst/daf/butler/datastores/inMemoryDatastore.py index 7cb4717a13..d7aa2dbb3d 100644 --- a/python/lsst/daf/butler/datastores/inMemoryDatastore.py +++ b/python/lsst/daf/butler/datastores/inMemoryDatastore.py @@ -50,7 +50,7 @@ if TYPE_CHECKING: from lsst.daf.butler import Config, DatasetType, LookupKey from lsst.daf.butler.datastore import DatastoreOpaqueTable - from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo + from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ArtifactIndexInfo from lsst.daf.butler.registry.interfaces import DatasetIdRef, DatastoreRegistryBridgeManager log = logging.getLogger(__name__) @@ -529,7 +529,7 @@ def retrieveArtifacts( overwrite: bool | None = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: """Retrieve the file artifacts associated with the supplied refs. Parameters diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index 946fd25e10..2f75f68255 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -1309,7 +1309,7 @@ def retrieveArtifacts( ) -> list[ResourcePath]: # Docstring inherited. outdir = ResourcePath(destination) - paths, _, _ = self._datastore.retrieveArtifacts( + paths, _ = self._datastore.retrieveArtifacts( refs, outdir, transfer=transfer, @@ -1544,13 +1544,13 @@ def ingest_zip(self, zip_file: ResourcePathExpression, transfer: str = "auto") - id_to_ref = {ref.id: ref for ref in refs} datasets: list[FileDataset] = [] processed_ids: set[uuid.UUID] = set() - for path, ids in index.ref_map.items(): + for path_in_zip, index_info in index.artifact_map.items(): # Disassembled composites need to check this ref isn't already # included. 
- unprocessed = {id_ for id_ in ids if id_ not in processed_ids} + unprocessed = {id_ for id_ in index_info.ids if id_ not in processed_ids} if not unprocessed: continue - dataset = FileDataset(refs=[id_to_ref[id_] for id_ in unprocessed], path=path) + dataset = FileDataset(refs=[id_to_ref[id_] for id_ in unprocessed], path=path_in_zip) datasets.append(dataset) processed_ids.update(unprocessed) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 2fef25a7bd..896d9a1f2c 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -30,7 +30,6 @@ __all__ = ("RemoteButler",) import uuid -from collections import defaultdict from collections.abc import Collection, Iterable, Iterator, Sequence from contextlib import AbstractContextManager, contextmanager from dataclasses import dataclass @@ -39,6 +38,7 @@ from deprecated.sphinx import deprecated from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ( + ArtifactIndexInfo, ZipIndex, determine_destination_for_retrieved_artifact, retrieve_and_zip, @@ -60,7 +60,6 @@ from .._utilities.locked_object import LockedObject from ..datastore import DatasetRefURIs, DatastoreConfig from ..datastore.cache_manager import AbstractDatastoreCacheManager, DatastoreCacheManager -from ..datastore.stored_file_info import StoredFileInfo from ..dimensions import DataIdValue, DimensionConfig, DimensionUniverse, SerializedDataId from ..queries import Query from ..registry import CollectionArgType, NoDefaultCollectionError, Registry, RegistryDefaults @@ -427,7 +426,7 @@ def _retrieve_artifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, list[DatasetId]], dict[ResourcePath, StoredFileInfo]]: + ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: destination = ResourcePath(destination).abspath() if not destination.isdir(): raise ValueError(f"Destination location must refer to a directory. Given {destination}.") @@ -435,16 +434,16 @@ def _retrieve_artifacts( if transfer not in ("auto", "copy"): raise ValueError("Only 'copy' and 'auto' transfer modes are supported.") - artifact_to_ref_id: dict[ResourcePath, list[DatasetId]] = defaultdict(list) - artifact_to_info: dict[ResourcePath, StoredFileInfo] = {} output_uris: list[ResourcePath] = [] have_copied: dict[ResourcePath, ResourcePath] = {} + artifact_map: dict[ResourcePath, ArtifactIndexInfo] = {} for ref in refs: prefix = str(ref.id)[:8] + "-" if add_prefix else "" file_info = _to_file_payload(self._get_file_info_for_ref(ref)).file_info for file in file_info: source_uri = ResourcePath(str(file.url)) # For decam/zip situation we only want to copy once. + # TODO: Unzip zip files on retrieval and merge indexes. cleaned_source_uri = source_uri.replace(fragment="", query="", params="") if cleaned_source_uri not in have_copied: relative_path = ResourcePath(file.datastoreRecords.path, forceAbsolute=False) @@ -456,18 +455,16 @@ def _retrieve_artifacts( target_uri.transfer_from(source_uri, transfer="copy", overwrite=overwrite) output_uris.append(target_uri) have_copied[cleaned_source_uri] = target_uri + artifact_map[target_uri] = ArtifactIndexInfo.from_single(file.datastoreRecords, ref.id) else: target_uri = have_copied[cleaned_source_uri] - # TODO: Fix this with Zip files that need to be unzipped - # on retrieval and index merged. 
- artifact_to_info[target_uri] = StoredFileInfo.from_simple(file.datastoreRecords) - artifact_to_ref_id[target_uri].append(ref.id) + artifact_map[target_uri].append(ref.id) if write_index: - index = ZipIndex.from_artifact_maps(refs, artifact_to_ref_id, artifact_to_info, destination) + index = ZipIndex.from_artifact_map(refs, artifact_map, destination) index.write_index(destination) - return output_uris, artifact_to_ref_id, artifact_to_info + return output_uris, artifact_map def retrieve_artifacts_zip( self, @@ -484,7 +481,7 @@ def retrieveArtifacts( preserve_path: bool = True, overwrite: bool = False, ) -> list[ResourcePath]: - paths, _, _ = self._retrieve_artifacts( + paths, _ = self._retrieve_artifacts( refs, destination, transfer, diff --git a/tests/test_quantumBackedButler.py b/tests/test_quantumBackedButler.py index fd86ceaa09..97e5276057 100644 --- a/tests/test_quantumBackedButler.py +++ b/tests/test_quantumBackedButler.py @@ -228,7 +228,7 @@ def test_getput(self) -> None: zip_refs = index.refs.to_refs(universe=qbb.dimensions) self.assertEqual(len(zip_refs), 4) self.assertEqual(set(zip_refs), set(self.output_refs)) - self.assertEqual(len(index.info_map), 4) # Count number of artifacts in Zip. + self.assertEqual(len(index.artifact_map), 4) # Count number of artifacts in Zip. def test_getDeferred(self) -> None: """Test for getDeferred method""" From 4576a806c2e29a96716c1b858b429f5c1929c442 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Sat, 26 Oct 2024 10:51:26 -0700 Subject: [PATCH 25/39] Switch to sets just in case there are duplicates Which is rare with the new query system but is possible. --- .../butler/datastores/file_datastore/retrieve_artifacts.py | 4 ++-- python/lsst/daf/butler/script/retrieveArtifacts.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 85752d870f..41c055a0dc 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -183,7 +183,7 @@ class ArtifactIndexInfo(BaseModel): info: SerializedStoredFileInfo """Datastore record information for this file artifact.""" - ids: list[uuid.UUID] + ids: set[uuid.UUID] """Dataset IDs for this artifact.""" def append(self, id_: uuid.UUID) -> None: @@ -194,7 +194,7 @@ def append(self, id_: uuid.UUID) -> None: id_ : `uuid.UUID` Additional dataset ID to associate with this artifact. """ - self.ids.append(id_) + self.ids.add(id_) @classmethod def from_single(cls, info: SerializedStoredFileInfo, id_: uuid.UUID) -> Self: diff --git a/python/lsst/daf/butler/script/retrieveArtifacts.py b/python/lsst/daf/butler/script/retrieveArtifacts.py index eea5537c66..45b1c3e4b0 100644 --- a/python/lsst/daf/butler/script/retrieveArtifacts.py +++ b/python/lsst/daf/butler/script/retrieveArtifacts.py @@ -104,7 +104,7 @@ def retrieveArtifacts( butler = Butler.from_config(repo, writeable=False) - # Need to store in list so we can count the number to give some feedback + # Need to store in set so we can count the number to give some feedback # to caller. 
query = QueryDatasets( butler=butler, @@ -116,7 +116,7 @@ def retrieveArtifacts( order_by=order_by, show_uri=False, ) - refs = list(itertools.chain(*query.getDatasets())) + refs = set(itertools.chain(*query.getDatasets())) log.info("Number of datasets matching query: %d", len(refs)) if not refs: return [] From c8cb40679a569f6bebfc19b1c5b37d6a13d72567 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Sat, 26 Oct 2024 12:08:15 -0700 Subject: [PATCH 26/39] Remove a warning in test code and report warnings from caller --- python/lsst/daf/butler/registry/queries/_results.py | 4 ++++ python/lsst/daf/butler/script/queryDatasets.py | 2 ++ tests/test_cliCmdPruneDatasets.py | 5 ++++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/registry/queries/_results.py b/python/lsst/daf/butler/registry/queries/_results.py index c643e7ce09..fbb904a0ec 100644 --- a/python/lsst/daf/butler/registry/queries/_results.py +++ b/python/lsst/daf/butler/registry/queries/_results.py @@ -47,6 +47,7 @@ from typing import Any, Self from deprecated.sphinx import deprecated +from lsst.utils.introspection import find_outside_stacklevel from ..._dataset_ref import DatasetRef from ..._dataset_type import DatasetType @@ -537,6 +538,9 @@ def limit(self, limit: int, offset: int | None | EllipsisType = ...) -> Self: "'offset' parameter should no longer be used. It is not supported by the new query system." " Will be removed after v28.", FutureWarning, + stacklevel=find_outside_stacklevel( + "lsst.daf.butler", allow_modules={"lsst.daf.butler.registry.tests"} + ), ) if offset is None or offset is ...: offset = 0 diff --git a/python/lsst/daf/butler/script/queryDatasets.py b/python/lsst/daf/butler/script/queryDatasets.py index 838fe7fe50..739ea8eb30 100644 --- a/python/lsst/daf/butler/script/queryDatasets.py +++ b/python/lsst/daf/butler/script/queryDatasets.py @@ -191,6 +191,7 @@ def __init__( warnings.warn( "No --collections specified. The --collections argument will become mandatory after v28.", FutureWarning, + stacklevel=2, ) glob = list(glob) if not glob: @@ -198,6 +199,7 @@ def __init__( "No dataset types specified. Explicitly specifying dataset types will become mandatory" " after v28. Specify '*' to match the current behavior of querying all dataset types.", FutureWarning, + stacklevel=2, ) # show_uri requires a datastore. diff --git a/tests/test_cliCmdPruneDatasets.py b/tests/test_cliCmdPruneDatasets.py index 59c4c2274d..8c16097151 100644 --- a/tests/test_cliCmdPruneDatasets.py +++ b/tests/test_cliCmdPruneDatasets.py @@ -80,6 +80,9 @@ def getRefs(): def makeQueryDatasets(*args, **kwargs): """Return a query datasets object.""" + if not kwargs.get("glob"): + # Use all dataset types if not specified. 
+ kwargs["glob"] = ("*",) return QueryDatasets(*args, **kwargs) @@ -98,7 +101,7 @@ def setUp(self): @staticmethod def makeQueryDatasetsArgs(*, repo, **kwargs): expectedArgs = dict( - repo=repo, collections=(), where="", find_first=True, show_uri=False, glob=tuple() + repo=repo, collections=("*",), where="", find_first=True, show_uri=False, glob=tuple() ) expectedArgs.update(kwargs) return expectedArgs From a57e725d0f40e0373de2ec639d6e18e76f22f432 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Mon, 28 Oct 2024 15:57:50 -0700 Subject: [PATCH 27/39] Support zip unpacking in retrieve artifacts --- .../daf/butler/datastores/fileDatastore.py | 24 +++-- .../file_datastore/retrieve_artifacts.py | 97 ++++++++++++++++++- .../butler/remote_butler/_remote_butler.py | 18 +++- 3 files changed, 128 insertions(+), 11 deletions(-) diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index 3dd3173879..eb82465162 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -84,6 +84,7 @@ ArtifactIndexInfo, ZipIndex, determine_destination_for_retrieved_artifact, + unpack_zips, ) from lsst.daf.butler.datastores.fileDatastoreClient import ( FileDatastoreGetPayload, @@ -2100,6 +2101,7 @@ def retrieveArtifacts( # This also helps filter out duplicate DatasetRef in the request # that will map to the same underlying file transfer. to_transfer: dict[ResourcePath, ResourcePath] = {} + zips_to_transfer: set[ResourcePath] = set() # Retrieve all the records in bulk indexed by ref.id. records = self._get_stored_records_associated_with_refs(refs, ignore_datastore_records=True) @@ -2125,10 +2127,16 @@ def retrieveArtifacts( location = info.file_location(self.locationFactory) source_uri = location.uri # For DECam/zip we only want to copy once. + # For zip files we need to unpack so that they can be + # zipped up again if needed. + is_zip = source_uri.getExtension() == ".zip" and "zip-path" in source_uri.fragment # We need to remove fragments for consistency. - # TODO: Unzip zip files on retrieval and merge indexes. cleaned_source_uri = source_uri.replace(fragment="", query="", params="") - if cleaned_source_uri not in to_transfer: + if is_zip: + # Assume the DatasetRef definitions are within the Zip + # file itself and so can be dropped from loop. + zips_to_transfer.add(cleaned_source_uri) + elif cleaned_source_uri not in to_transfer: target_uri = determine_destination_for_retrieved_artifact( destination, location.pathInStore, preserve_path, prefix ) @@ -2143,11 +2151,15 @@ def retrieveArtifacts( for source_uri, target_uri in to_transfer.items(): target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) + # Transfer the Zip files and unpack them. + zipped_artifacts = unpack_zips(zips_to_transfer, requested_ids, destination, preserve_path) + artifact_map.update(zipped_artifacts) + if write_index: index = ZipIndex.from_artifact_map(refs, artifact_map, destination) index.write_index(destination) - return list(to_transfer.values()), artifact_map + return list(artifact_map.keys()), artifact_map def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: """Ingest an indexed Zip file and contents. @@ -2203,10 +2215,8 @@ def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: # Reference in original location. 
tgtLocation = None else: - zip_name = index.calculate_zip_file_name() - # Zip name is UUID so add subdir of first few characters of UUID - # to spread things out if there are many Zip files. - tgtLocation = self.locationFactory.fromPath(f"zips/{zip_name[:4]}/{zip_name}") + # Name the zip file based on index contents. + tgtLocation = self.locationFactory.fromPath(index.calculate_zip_file_path_in_store()) if not tgtLocation.uri.dirname().exists(): log.debug("Folder %s does not exist yet.", tgtLocation.uri.dirname()) tgtLocation.uri.dirname().mkdir() diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 41c055a0dc..def71e663c 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -27,7 +27,7 @@ from __future__ import annotations -__all__ = ("determine_destination_for_retrieved_artifact", "retrieve_and_zip", "ZipIndex") +__all__ = ("determine_destination_for_retrieved_artifact", "retrieve_and_zip", "unpack_zips", "ZipIndex") import logging import tempfile @@ -209,6 +209,29 @@ def from_single(cls, info: SerializedStoredFileInfo, id_: uuid.UUID) -> Self: """ return cls(info=info, ids=[id_]) + def subset(self, ids: Iterable[uuid.UUID]) -> Self: + """Replace the IDs with a subset of the IDs and return a new instance. + + Parameters + ---------- + ids : `~collections.abc.Iterable` [ `uuid.UUID` ] + Subset of IDs to keep. + + Returns + ------- + subsetted : `ArtifactIndexInfo` + New instance with the requested subset. + + Raises + ------ + ValueError + Raised if the given IDs is not a subset of the current IDs. + """ + subset = set(ids) + if subset - self.ids: + raise ValueError(f"Given subset of {subset} is not a subset of {self.ids}") + return type(self)(ids=subset, info=self.info.model_copy()) + class ZipIndex(BaseModel): """Index of a Zip file of Butler datasets. @@ -264,6 +287,18 @@ def calculate_zip_file_name(self) -> str: """ return f"{self.generate_uuid5()}.zip" + def calculate_zip_file_path_in_store(self) -> str: + """Calculate the relative path inside a datastore that should be + used for this Zip file. + + Returns + ------- + path_in_store : `str` + Relative path to use for Zip file in datastore. + """ + zip_name = self.calculate_zip_file_name() + return f"zips/{zip_name[:4]}/{zip_name}" + def write_index(self, dir: ResourcePath) -> ResourcePath: """Write the index to the specified directory. @@ -478,3 +513,63 @@ def retrieve_and_zip( zip.write(path.ospath, name) return zip_path + + +def unpack_zips( + zips_to_transfer: Iterable[ResourcePath], + allowed_ids: set[uuid.UUID], + destination: ResourcePath, + preserve_path: bool, +) -> dict[ResourcePath, ArtifactIndexInfo]: + """Transfer the Zip files and unpack them in the destination directory. + + Parameters + ---------- + zips_to_transfer : `~collections.abc.Iterable` \ + [ `~lsst.resources.ResourcePath` ] + Paths to Zip files to unpack. These must be Zip files that include + the index information and were created by the Butler. + allowed_ids : `set` [ `uuid.UUID` ] + All the possible dataset IDs for which artifacts should be extracted + from the Zip file. If an ID in the Zip file is not present in this + list the artifact will not be extracted from the Zip. + destination : `~lsst.resources.ResourcePath` + Output destination for the Zip contents. + preserve_path : `bool` + Whether to include subdirectories during extraction. 
If `True` a + directory will be made per Zip. + + Returns + ------- + artifact_map : `dict` \ + [ `~lsst.resources.ResourcePath`, `ArtifactIndexInfo` ] + Path linking Zip contents location to associated artifact information. + """ + artifact_map: dict[ResourcePath, ArtifactIndexInfo] = {} + for source_uri in zips_to_transfer: + _LOG.debug("Unpacking zip file %s", source_uri) + # Assume that downloading to temporary location is more efficient + # than trying to read the contents remotely. + with ResourcePath.temporary_uri(suffix=".zip") as temp: + temp.transfer_from(source_uri, transfer="auto") + index = ZipIndex.from_zip_file(temp) + + if preserve_path: + subdir = ResourcePath( + index.calculate_zip_file_path_in_store(), forceDirectory=False, forceAbsolute=False + ).dirname() + outdir = destination.join(subdir) + else: + outdir = destination + outdir.mkdir() + with temp.open("rb") as fd, zipfile.ZipFile(fd) as zf: + for path_in_zip, artifact_info in index.artifact_map.items(): + # Skip if this specific dataset ref is not requested. + included_ids = artifact_info.ids & allowed_ids + if included_ids: + # Do not apply a new prefix since the zip file + # should already have a prefix. + zf.extract(path_in_zip, path=outdir.ospath) + output_path = outdir.join(path_in_zip, forceDirectory=False) + artifact_map[output_path] = artifact_info.subset(included_ids) + return artifact_map diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 896d9a1f2c..cff6f8165b 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -42,6 +42,7 @@ ZipIndex, determine_destination_for_retrieved_artifact, retrieve_and_zip, + unpack_zips, ) from lsst.daf.butler.datastores.fileDatastoreClient import ( FileDatastoreGetPayload, @@ -434,6 +435,7 @@ def _retrieve_artifacts( if transfer not in ("auto", "copy"): raise ValueError("Only 'copy' and 'auto' transfer modes are supported.") + requested_ids = {ref.id for ref in refs} output_uris: list[ResourcePath] = [] have_copied: dict[ResourcePath, ResourcePath] = {} artifact_map: dict[ResourcePath, ArtifactIndexInfo] = {} @@ -442,10 +444,20 @@ def _retrieve_artifacts( file_info = _to_file_payload(self._get_file_info_for_ref(ref)).file_info for file in file_info: source_uri = ResourcePath(str(file.url)) - # For decam/zip situation we only want to copy once. - # TODO: Unzip zip files on retrieval and merge indexes. + # For DECam/zip we only want to copy once. + # For zip files we need to unpack so that they can be + # zipped up again if needed. 
+ is_zip = source_uri.getExtension() == ".zip" and "zip-path" in source_uri.fragment cleaned_source_uri = source_uri.replace(fragment="", query="", params="") - if cleaned_source_uri not in have_copied: + if is_zip: + if cleaned_source_uri not in have_copied: + zipped_artifacts = unpack_zips( + [cleaned_source_uri], requested_ids, destination, preserve_path + ) + artifact_map.update(zipped_artifacts) + have_copied[cleaned_source_uri] = cleaned_source_uri + output_uris.extend(artifact_map.keys()) + elif cleaned_source_uri not in have_copied: relative_path = ResourcePath(file.datastoreRecords.path, forceAbsolute=False) target_uri = determine_destination_for_retrieved_artifact( destination, relative_path, preserve_path, prefix From 97c2d29dcd903919335f2669fa5c2125beec5a8f Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 07:26:00 -0700 Subject: [PATCH 28/39] Include version information in ZipIndex model --- .../file_datastore/retrieve_artifacts.py | 37 ++++++++++++++++--- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index def71e663c..3b7bcc8a50 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -34,12 +34,12 @@ import uuid import zipfile from collections.abc import Callable, Iterable -from typing import ClassVar, Self +from typing import Annotated, ClassVar, Literal, Self, TypeAlias from lsst.daf.butler import DatasetIdFactory, DatasetRef from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo from lsst.resources import ResourcePath, ResourcePathExpression -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict, Field from ..._dataset_type import DatasetType, SerializedDatasetType from ...dimensions import DataCoordinate, DimensionUniverse, SerializedDataCoordinate, SerializedDataId @@ -54,6 +54,8 @@ class MinimalistDatasetRef(BaseModel): to this information. """ + model_config = ConfigDict(frozen=True) + dataset_type_name: str """Name of the dataset type.""" @@ -70,6 +72,18 @@ class SerializedDatasetRefContainer(BaseModel): Dimension records are not included. """ + model_config = ConfigDict(extra="allow", frozen=True) + container_version: str + + +class SerializedDatasetRefContainerV1(SerializedDatasetRefContainer): + """Serializable model for a collection of DatasetRef. + + Dimension records are not included. + """ + + container_version: Literal["V1"] = "V1" + universe_version: int """Dimension universe version.""" @@ -177,6 +191,12 @@ def to_refs(self, universe: DimensionUniverse) -> list[DatasetRef]: return refs +SerializedDatasetRefContainers: TypeAlias = Annotated[ + SerializedDatasetRefContainerV1, + Field(discriminator="container_version"), +] + + class ArtifactIndexInfo(BaseModel): """Information related to an artifact in an index.""" @@ -242,7 +262,9 @@ class ZipIndex(BaseModel): file datastore. 
""" - refs: SerializedDatasetRefContainer + index_version: Literal["V1"] = "V1" + + refs: SerializedDatasetRefContainers """Deduplicated information for all the `DatasetRef` in the index.""" artifact_map: dict[str, ArtifactIndexInfo] @@ -314,7 +336,10 @@ def write_index(self, dir: ResourcePath) -> ResourcePath: """ index_path = dir.join(self.index_name, forceDirectory=False) with index_path.open("w") as fd: - print(self.model_dump_json(exclude_defaults=True, exclude_unset=True), file=fd) + # Need to include unset/default values so that the version + # discriminator field for refs container appears in the + # serialization. + print(self.model_dump_json(), file=fd) return index_path @classmethod @@ -367,7 +392,7 @@ def from_artifact_map( """ if not refs: return cls( - refs=SerializedDatasetRefContainer.from_refs(refs), + refs=SerializedDatasetRefContainerV1.from_refs(refs), artifact_map={}, ) @@ -375,7 +400,7 @@ def from_artifact_map( # uses relative paths. file_to_relative = cls.calc_relative_paths(root, artifact_map.keys()) - simplified_refs = SerializedDatasetRefContainer.from_refs(refs) + simplified_refs = SerializedDatasetRefContainerV1.from_refs(refs) # Convert the artifact mapping to relative path. relative_artifact_map = {file_to_relative[path]: info for path, info in artifact_map.items()} From 7db9443964920ce4b926ad90e307c2f9c4b03a35 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 13:02:35 -0700 Subject: [PATCH 29/39] Only serialize required dimensions for dataset type and data coordinate There is nothing to be gained by including all the dimensions other than taking extra space. --- python/lsst/daf/butler/_dataset_type.py | 2 +- python/lsst/daf/butler/dimensions/_coordinate.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/lsst/daf/butler/_dataset_type.py b/python/lsst/daf/butler/_dataset_type.py index c7b546cc94..e9ba515ef9 100644 --- a/python/lsst/daf/butler/_dataset_type.py +++ b/python/lsst/daf/butler/_dataset_type.py @@ -676,7 +676,7 @@ def to_simple(self, minimal: bool = False) -> SerializedDatasetType: "name": self.name, "storageClass": self._storageClassName, "isCalibration": self._isCalibration, - "dimensions": list(self._dimensions.names), + "dimensions": list(self._dimensions.required), } if self._parentStorageClassName is not None: diff --git a/python/lsst/daf/butler/dimensions/_coordinate.py b/python/lsst/daf/butler/dimensions/_coordinate.py index d6f4189e9c..d511a49cac 100644 --- a/python/lsst/daf/butler/dimensions/_coordinate.py +++ b/python/lsst/daf/butler/dimensions/_coordinate.py @@ -709,7 +709,7 @@ def to_simple(self, minimal: bool = False) -> SerializedDataCoordinate: else: records = None - return SerializedDataCoordinate(dataId=dict(self.mapping), records=records) + return SerializedDataCoordinate(dataId=dict(self.required), records=records) @classmethod def from_simple( From f2cbd09c184dee95b5878f2d2664e583829c9e8a Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 13:55:08 -0700 Subject: [PATCH 30/39] Ensure that zip unpacking also respects overwrite flag --- python/lsst/daf/butler/datastores/fileDatastore.py | 2 +- .../datastores/file_datastore/retrieve_artifacts.py | 9 +++++++++ python/lsst/daf/butler/remote_butler/_remote_butler.py | 2 +- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index eb82465162..f677028714 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py 
+++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -2152,7 +2152,7 @@ def retrieveArtifacts( target_uri.transfer_from(source_uri, transfer=transfer, overwrite=overwrite) # Transfer the Zip files and unpack them. - zipped_artifacts = unpack_zips(zips_to_transfer, requested_ids, destination, preserve_path) + zipped_artifacts = unpack_zips(zips_to_transfer, requested_ids, destination, preserve_path, overwrite) artifact_map.update(zipped_artifacts) if write_index: diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 3b7bcc8a50..723ac40b63 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -545,6 +545,7 @@ def unpack_zips( allowed_ids: set[uuid.UUID], destination: ResourcePath, preserve_path: bool, + overwrite: bool, ) -> dict[ResourcePath, ArtifactIndexInfo]: """Transfer the Zip files and unpack them in the destination directory. @@ -563,6 +564,9 @@ def unpack_zips( preserve_path : `bool` Whether to include subdirectories during extraction. If `True` a directory will be made per Zip. + overwrite : `bool`, optional + If `True` allow transfers to overwrite existing files at the + destination. Returns ------- @@ -596,5 +600,10 @@ def unpack_zips( # should already have a prefix. zf.extract(path_in_zip, path=outdir.ospath) output_path = outdir.join(path_in_zip, forceDirectory=False) + if not overwrite and output_path.exists(): + raise FileExistsError( + f"Destination path '{output_path}' already exists. " + "Extraction from Zip cannot be completed." + ) artifact_map[output_path] = artifact_info.subset(included_ids) return artifact_map diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index cff6f8165b..9eb018798e 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -452,7 +452,7 @@ def _retrieve_artifacts( if is_zip: if cleaned_source_uri not in have_copied: zipped_artifacts = unpack_zips( - [cleaned_source_uri], requested_ids, destination, preserve_path + [cleaned_source_uri], requested_ids, destination, preserve_path, overwrite ) artifact_map.update(zipped_artifacts) have_copied[cleaned_source_uri] = cleaned_source_uri From 516d714cacb6baf309d943d7999aa56a7b74af35 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 14:23:24 -0700 Subject: [PATCH 31/39] Use a protocol for retrieve artifacts callback This makes it much more explicit what the parameters mean. 
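A sketch of what the Protocol buys (the FakeRetriever class below is a
hypothetical stand-in, not part of this change): any object whose __call__
accepts these keyword parameters type-checks as a RetrievalCallable, so both
Datastore.retrieveArtifacts and RemoteButler._retrieve_artifacts can be handed
to retrieve_and_zip, and so can a test double:

    class FakeRetriever:
        """Toy callback whose signature satisfies RetrievalCallable."""

        def __call__(self, refs, destination, transfer, preserve_path,
                     overwrite, write_index, add_prefix):
            # A real callback must also write the index file when
            # write_index is True; this stub only demonstrates the typing.
            return [], {}

    # retrieve_and_zip(refs, destination, FakeRetriever())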
--- .../file_datastore/retrieve_artifacts.py | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 723ac40b63..9afb081391 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -33,8 +33,8 @@ import tempfile import uuid import zipfile -from collections.abc import Callable, Iterable -from typing import Annotated, ClassVar, Literal, Self, TypeAlias +from collections.abc import Iterable +from typing import Annotated, ClassVar, Literal, Protocol, Self, TypeAlias from lsst.daf.butler import DatasetIdFactory, DatasetRef from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo @@ -468,13 +468,23 @@ def determine_destination_for_retrieved_artifact( return target_uri +class RetrievalCallable(Protocol): + def __call__( + self, + refs: Iterable[DatasetRef], + destination: ResourcePath, + transfer: str, + preserve_path: bool, + overwrite: bool, + write_index: bool, + add_prefix: bool, + ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: ... + + def retrieve_and_zip( refs: Iterable[DatasetRef], destination: ResourcePathExpression, - retrieval_callback: Callable[ - [Iterable[DatasetRef], ResourcePath, str, bool, bool, bool, bool], - tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]], - ], + retrieval_callback: RetrievalCallable, ) -> ResourcePath: """Retrieve artifacts from a Butler and place in ZIP file. @@ -522,7 +532,15 @@ def retrieve_and_zip( with tempfile.TemporaryDirectory(dir=outdir.ospath, ignore_cleanup_errors=True) as tmpdir: tmpdir_path = ResourcePath(tmpdir, forceDirectory=True) # Retrieve the artifacts and write the index file. Strip paths. - paths, _ = retrieval_callback(refs, tmpdir_path, "auto", False, False, True, True) + paths, _ = retrieval_callback( + refs=refs, + destination=tmpdir_path, + transfer="auto", + preserve_path=False, + overwrite=False, + write_index=True, + add_prefix=True, + ) # Read the index to construct file name. index_path = tmpdir_path.join(ZipIndex.index_name, forceDirectory=False) From 0c75a823b6ee4eb4a6d727e846e8078bd47bd7d7 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 14:43:46 -0700 Subject: [PATCH 32/39] Simplify Datastore.retrieveArtifacts return value The mapping included all the paths so no reason to return the paths separately. 
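For callers the change is mechanical, since the mapping keys are the retrieved
URIs; a minimal before/after sketch (variable names illustrative, mirroring
the DirectButler update below):

    # Before this change: URIs returned alongside the artifact map.
    paths, _ = datastore.retrieveArtifacts(refs, outdir, transfer="auto")

    # After: the artifact map is the sole return value; its keys are the
    # destination URIs, so the old path list is recovered with list().
    artifact_map = datastore.retrieveArtifacts(refs, outdir, transfer="auto")
    paths = list(artifact_map)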
--- python/lsst/daf/butler/datastore/_datastore.py | 7 ++----- python/lsst/daf/butler/datastores/chainedDatastore.py | 11 +++-------- python/lsst/daf/butler/datastores/fileDatastore.py | 7 ++----- .../datastores/file_datastore/retrieve_artifacts.py | 6 +++--- .../lsst/daf/butler/datastores/inMemoryDatastore.py | 2 +- .../lsst/daf/butler/direct_butler/_direct_butler.py | 4 ++-- .../lsst/daf/butler/remote_butler/_remote_butler.py | 11 ++++------- 7 files changed, 17 insertions(+), 31 deletions(-) diff --git a/python/lsst/daf/butler/datastore/_datastore.py b/python/lsst/daf/butler/datastore/_datastore.py index d3de5fedda..7b09beda60 100644 --- a/python/lsst/daf/butler/datastore/_datastore.py +++ b/python/lsst/daf/butler/datastore/_datastore.py @@ -1026,7 +1026,7 @@ def retrieveArtifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: + ) -> dict[ResourcePath, ArtifactIndexInfo]: """Retrieve the artifacts associated with the supplied refs. Parameters @@ -1058,9 +1058,6 @@ def retrieveArtifacts( Returns ------- - targets : `list` of `lsst.resources.ResourcePath` - URIs of file artifacts in destination location. Order is not - preserved. artifact_map : `dict` [ `lsst.resources.ResourcePath`, \ `ArtifactIndexInfo` ] Mapping of retrieved file to associated index information. @@ -1491,7 +1488,7 @@ def retrieveArtifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: + ) -> dict[ResourcePath, ArtifactIndexInfo]: raise NotImplementedError("This is a no-op datastore that can not access a real datastore") def remove(self, datasetRef: DatasetRef) -> None: diff --git a/python/lsst/daf/butler/datastores/chainedDatastore.py b/python/lsst/daf/butler/datastores/chainedDatastore.py index 1ad8b637a9..d2f1859fa7 100644 --- a/python/lsst/daf/butler/datastores/chainedDatastore.py +++ b/python/lsst/daf/butler/datastores/chainedDatastore.py @@ -812,7 +812,7 @@ def retrieveArtifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: + ) -> dict[ResourcePath, ArtifactIndexInfo]: """Retrieve the file artifacts associated with the supplied refs. Parameters @@ -843,9 +843,6 @@ def retrieveArtifacts( Returns ------- - targets : `list` of `lsst.resources.ResourcePath` - URIs of file artifacts in destination location. Order is not - preserved. artifact_map : `dict` [ `lsst.resources.ResourcePath`, \ `ArtifactIndexInfo` ] Mapping of retrieved file to associated index information. @@ -893,10 +890,9 @@ def retrieveArtifacts( raise RuntimeError(f"Some datasets were not found in any datastores: {pending}") # Now do the transfer. - targets: list[ResourcePath] = [] merged_artifact_map: dict[ResourcePath, ArtifactIndexInfo] = {} for number, datastore_refs in grouped_by_datastore.items(): - retrieved, artifact_map = self.datastores[number].retrieveArtifacts( + artifact_map = self.datastores[number].retrieveArtifacts( datastore_refs, destination, transfer=transfer, @@ -905,14 +901,13 @@ def retrieveArtifacts( write_index=False, # Disable index writing regardless. 
add_prefix=add_prefix, ) - targets.extend(retrieved) merged_artifact_map.update(artifact_map) if write_index: index = ZipIndex.from_artifact_map(refs, merged_artifact_map, destination) index.write_index(destination) - return targets, merged_artifact_map + return merged_artifact_map def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: """Ingest an indexed Zip file and contents. diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index f677028714..7928747bd1 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -2052,7 +2052,7 @@ def retrieveArtifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: + ) -> dict[ResourcePath, ArtifactIndexInfo]: """Retrieve the file artifacts associated with the supplied refs. Parameters @@ -2084,9 +2084,6 @@ def retrieveArtifacts( Returns ------- - targets : `list` of `lsst.resources.ResourcePath` - URIs of file artifacts in destination location. Order is not - preserved. artifact_map : `dict` [ `lsst.resources.ResourcePath`, \ `ArtifactIndexInfo` ] Mapping of retrieved file to associated index information. @@ -2159,7 +2156,7 @@ def retrieveArtifacts( index = ZipIndex.from_artifact_map(refs, artifact_map, destination) index.write_index(destination) - return list(artifact_map.keys()), artifact_map + return artifact_map def ingest_zip(self, zip_path: ResourcePath, transfer: str | None) -> None: """Ingest an indexed Zip file and contents. diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 9afb081391..41af46ea6f 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -478,7 +478,7 @@ def __call__( overwrite: bool, write_index: bool, add_prefix: bool, - ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: ... + ) -> dict[ResourcePath, ArtifactIndexInfo]: ... def retrieve_and_zip( @@ -532,7 +532,7 @@ def retrieve_and_zip( with tempfile.TemporaryDirectory(dir=outdir.ospath, ignore_cleanup_errors=True) as tmpdir: tmpdir_path = ResourcePath(tmpdir, forceDirectory=True) # Retrieve the artifacts and write the index file. Strip paths. 
- paths, _ = retrieval_callback( + artifact_map = retrieval_callback( refs=refs, destination=tmpdir_path, transfer="auto", @@ -552,7 +552,7 @@ def retrieve_and_zip( zip_path = outdir.join(zip_file_name, forceDirectory=False) with zipfile.ZipFile(zip_path.ospath, "w") as zip: zip.write(index_path.ospath, index_path.basename()) - for path, name in index.calc_relative_paths(tmpdir_path, paths).items(): + for path, name in index.calc_relative_paths(tmpdir_path, list(artifact_map)).items(): zip.write(path.ospath, name) return zip_path diff --git a/python/lsst/daf/butler/datastores/inMemoryDatastore.py b/python/lsst/daf/butler/datastores/inMemoryDatastore.py index d7aa2dbb3d..ecd0e1b9ff 100644 --- a/python/lsst/daf/butler/datastores/inMemoryDatastore.py +++ b/python/lsst/daf/butler/datastores/inMemoryDatastore.py @@ -529,7 +529,7 @@ def retrieveArtifacts( overwrite: bool | None = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: + ) -> dict[ResourcePath, ArtifactIndexInfo]: """Retrieve the file artifacts associated with the supplied refs. Parameters diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index 2f75f68255..ac75275d06 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -1309,7 +1309,7 @@ def retrieveArtifacts( ) -> list[ResourcePath]: # Docstring inherited. outdir = ResourcePath(destination) - paths, _ = self._datastore.retrieveArtifacts( + artifact_map = self._datastore.retrieveArtifacts( refs, outdir, transfer=transfer, @@ -1317,7 +1317,7 @@ def retrieveArtifacts( overwrite=overwrite, write_index=True, ) - return paths + return list(artifact_map) def exists( self, diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 9eb018798e..43ab1697c7 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -427,7 +427,7 @@ def _retrieve_artifacts( overwrite: bool = False, write_index: bool = True, add_prefix: bool = False, - ) -> tuple[list[ResourcePath], dict[ResourcePath, ArtifactIndexInfo]]: + ) -> dict[ResourcePath, ArtifactIndexInfo]: destination = ResourcePath(destination).abspath() if not destination.isdir(): raise ValueError(f"Destination location must refer to a directory. Given {destination}.") @@ -436,7 +436,6 @@ def _retrieve_artifacts( raise ValueError("Only 'copy' and 'auto' transfer modes are supported.") requested_ids = {ref.id for ref in refs} - output_uris: list[ResourcePath] = [] have_copied: dict[ResourcePath, ResourcePath] = {} artifact_map: dict[ResourcePath, ArtifactIndexInfo] = {} for ref in refs: @@ -456,7 +455,6 @@ def _retrieve_artifacts( ) artifact_map.update(zipped_artifacts) have_copied[cleaned_source_uri] = cleaned_source_uri - output_uris.extend(artifact_map.keys()) elif cleaned_source_uri not in have_copied: relative_path = ResourcePath(file.datastoreRecords.path, forceAbsolute=False) target_uri = determine_destination_for_retrieved_artifact( @@ -465,7 +463,6 @@ def _retrieve_artifacts( # Because signed URLs expire, we want to do the transfer # soon after retrieving the URL. 
target_uri.transfer_from(source_uri, transfer="copy", overwrite=overwrite) - output_uris.append(target_uri) have_copied[cleaned_source_uri] = target_uri artifact_map[target_uri] = ArtifactIndexInfo.from_single(file.datastoreRecords, ref.id) else: @@ -476,7 +473,7 @@ def _retrieve_artifacts( index = ZipIndex.from_artifact_map(refs, artifact_map, destination) index.write_index(destination) - return output_uris, artifact_map + return artifact_map def retrieve_artifacts_zip( self, @@ -493,14 +490,14 @@ def retrieveArtifacts( preserve_path: bool = True, overwrite: bool = False, ) -> list[ResourcePath]: - paths, _ = self._retrieve_artifacts( + artifact_map = self._retrieve_artifacts( refs, destination, transfer, preserve_path, overwrite, ) - return paths + return list(artifact_map) def exists( self, From 13b0f15c6bb0a716344ce6f1d90734eb3217e83a Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 14:58:27 -0700 Subject: [PATCH 33/39] Support clobber/no-clobber when writing Zip archive --- python/lsst/daf/butler/_butler.py | 4 ++++ python/lsst/daf/butler/_quantum_backed.py | 6 +++++- .../datastores/file_datastore/retrieve_artifacts.py | 8 +++++++- python/lsst/daf/butler/direct_butler/_direct_butler.py | 3 ++- python/lsst/daf/butler/remote_butler/_remote_butler.py | 3 ++- python/lsst/daf/butler/script/retrieveArtifacts.py | 2 +- python/lsst/daf/butler/tests/hybrid_butler.py | 3 ++- 7 files changed, 23 insertions(+), 6 deletions(-) diff --git a/python/lsst/daf/butler/_butler.py b/python/lsst/daf/butler/_butler.py index c45fec258a..fbfedd6bb7 100644 --- a/python/lsst/daf/butler/_butler.py +++ b/python/lsst/daf/butler/_butler.py @@ -992,6 +992,7 @@ def retrieve_artifacts_zip( self, refs: Iterable[DatasetRef], destination: ResourcePathExpression, + overwrite: bool = True, ) -> ResourcePath: """Retrieve artifacts from a Butler and place in ZIP file. @@ -1003,6 +1004,9 @@ def retrieve_artifacts_zip( Directory to write the new ZIP file. This directory will also be used as a staging area for the datasets being downloaded from the datastore. + overwrite : `bool`, optional + If `False` the output Zip will not be written if a file of the + same name is already present in ``destination``. Returns ------- diff --git a/python/lsst/daf/butler/_quantum_backed.py b/python/lsst/daf/butler/_quantum_backed.py index 731e5a6183..067fb93c45 100644 --- a/python/lsst/daf/butler/_quantum_backed.py +++ b/python/lsst/daf/butler/_quantum_backed.py @@ -503,6 +503,7 @@ def retrieve_artifacts_zip( self, refs: Iterable[DatasetRef], destination: ResourcePathExpression, + overwrite: bool = True, ) -> ResourcePath: """Retrieve artifacts from the graph and place in ZIP file. @@ -514,6 +515,9 @@ def retrieve_artifacts_zip( Directory to write the new ZIP file. This directory will also be used as a staging area for the datasets being downloaded from the datastore. + overwrite : `bool`, optional + If `False` the output Zip will not be written if a file of the + same name is already present in ``destination``. Returns ------- @@ -525,7 +529,7 @@ def retrieve_artifacts_zip( ValueError Raised if there are no refs to retrieve. 
""" - return retrieve_and_zip(refs, destination, self._datastore.retrieveArtifacts) + return retrieve_and_zip(refs, destination, self._datastore.retrieveArtifacts, overwrite) def extract_provenance_data(self) -> QuantumProvenanceData: """Extract provenance information and datastore records from this diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 41af46ea6f..583d4ede16 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -485,6 +485,7 @@ def retrieve_and_zip( refs: Iterable[DatasetRef], destination: ResourcePathExpression, retrieval_callback: RetrievalCallable, + overwrite: bool = True, ) -> ResourcePath: """Retrieve artifacts from a Butler and place in ZIP file. @@ -501,6 +502,9 @@ def retrieve_and_zip( Bound method for a function that can retrieve the artifacts and return the metadata necessary for creating the zip index. For example `lsst.daf.butler.datastore.Datastore.retrieveArtifacts`. + overwrite : `bool`, optional + If `False` the output Zip will not be written if a file of the + same name is already present in ``destination``. Returns ------- @@ -550,6 +554,8 @@ def retrieve_and_zip( # Use unique name based on files in Zip. zip_file_name = index.calculate_zip_file_name() zip_path = outdir.join(zip_file_name, forceDirectory=False) + if not overwrite and zip_path.exists(): + raise FileExistsError(f"Output Zip at {zip_path} already exists but cannot overwrite.") with zipfile.ZipFile(zip_path.ospath, "w") as zip: zip.write(index_path.ospath, index_path.basename()) for path, name in index.calc_relative_paths(tmpdir_path, list(artifact_map)).items(): @@ -616,12 +622,12 @@ def unpack_zips( if included_ids: # Do not apply a new prefix since the zip file # should already have a prefix. - zf.extract(path_in_zip, path=outdir.ospath) output_path = outdir.join(path_in_zip, forceDirectory=False) if not overwrite and output_path.exists(): raise FileExistsError( f"Destination path '{output_path}' already exists. " "Extraction from Zip cannot be completed." 
) + zf.extract(path_in_zip, path=outdir.ospath) artifact_map[output_path] = artifact_info.subset(included_ids) return artifact_map diff --git a/python/lsst/daf/butler/direct_butler/_direct_butler.py b/python/lsst/daf/butler/direct_butler/_direct_butler.py index ac75275d06..892bcaa8f5 100644 --- a/python/lsst/daf/butler/direct_butler/_direct_butler.py +++ b/python/lsst/daf/butler/direct_butler/_direct_butler.py @@ -1296,8 +1296,9 @@ def retrieve_artifacts_zip( self, refs: Iterable[DatasetRef], destination: ResourcePathExpression, + overwrite: bool = True, ) -> ResourcePath: - return retrieve_and_zip(refs, destination, self._datastore.retrieveArtifacts) + return retrieve_and_zip(refs, destination, self._datastore.retrieveArtifacts, overwrite) def retrieveArtifacts( self, diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 43ab1697c7..63d8b7c9be 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -479,8 +479,9 @@ def retrieve_artifacts_zip( self, refs: Iterable[DatasetRef], destination: ResourcePathExpression, + overwrite: bool = True, ) -> ResourcePath: - return retrieve_and_zip(refs, destination, self._retrieve_artifacts) + return retrieve_and_zip(refs, destination, self._retrieve_artifacts, overwrite) def retrieveArtifacts( self, diff --git a/python/lsst/daf/butler/script/retrieveArtifacts.py b/python/lsst/daf/butler/script/retrieveArtifacts.py index 45b1c3e4b0..842e0fae00 100644 --- a/python/lsst/daf/butler/script/retrieveArtifacts.py +++ b/python/lsst/daf/butler/script/retrieveArtifacts.py @@ -126,7 +126,7 @@ def retrieveArtifacts( refs, destination=destination, transfer=transfer, preserve_path=preserve_path, overwrite=clobber ) else: - zip_file = butler.retrieve_artifacts_zip(refs, destination=destination) + zip_file = butler.retrieve_artifacts_zip(refs, destination=destination, overwrite=clobber) transferred = [zip_file] return transferred diff --git a/python/lsst/daf/butler/tests/hybrid_butler.py b/python/lsst/daf/butler/tests/hybrid_butler.py index 5fef275fbf..41181de15c 100644 --- a/python/lsst/daf/butler/tests/hybrid_butler.py +++ b/python/lsst/daf/butler/tests/hybrid_butler.py @@ -210,8 +210,9 @@ def retrieve_artifacts_zip( self, refs: Iterable[DatasetRef], destination: ResourcePathExpression, + overwrite: bool = True, ) -> ResourcePath: - return self._remote_butler.retrieve_artifacts_zip(refs, destination) + return self._remote_butler.retrieve_artifacts_zip(refs, destination, overwrite) def retrieveArtifacts( self, From 5082563c44abed775b9794bd17883fd8bb6c1600 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 15:34:05 -0700 Subject: [PATCH 34/39] Raise rather than use assert --- .../daf/butler/datastores/file_datastore/retrieve_artifacts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 583d4ede16..fd76d0ad45 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -365,7 +365,8 @@ def calc_relative_paths( for p in paths: # It is an error if there is no relative path. 
rel = p.relative_to(root) - assert rel is not None + if rel is None: + raise RuntimeError(f"Unexepectedly unable to calculate relative path of {p} to {root}.") file_to_relative[p] = rel return file_to_relative From 82cf78439a2fdaa7738c4bafc2c68b992ced16a8 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 16:11:33 -0700 Subject: [PATCH 35/39] Add tests for reading V1 ZipIndex JSON model --- tests/data/zip_index.json | 305 ++++++++++++++++++++++++++++++++++++++ tests/test_datasets.py | 31 ++++ 2 files changed, 336 insertions(+) create mode 100644 tests/data/zip_index.json diff --git a/tests/data/zip_index.json b/tests/data/zip_index.json new file mode 100644 index 0000000000..3c37b39a51 --- /dev/null +++ b/tests/data/zip_index.json @@ -0,0 +1,305 @@ +{ + "index_version": "V1", + "refs": { + "container_version": "V1", + "universe_version": 7, + "universe_namespace": "daf_butler", + "dataset_types": { + "raw": { + "name": "raw", + "storageClass": "Exposure", + "dimensions": [ + "instrument", + "detector", + "exposure" + ], + "parentStorageClass": null, + "isCalibration": false + }, + "calexp": { + "name": "calexp", + "storageClass": "ExposureF", + "dimensions": [ + "instrument", + "detector", + "visit" + ], + "parentStorageClass": null, + "isCalibration": false + }, + "isr_metadata": { + "name": "isr_metadata", + "storageClass": "TaskMetadata", + "dimensions": [ + "instrument", + "detector", + "exposure" + ], + "parentStorageClass": null, + "isCalibration": false + } + }, + "compact_refs": { + "3f452f1d-b8a5-5202-8190-4e723004a6ba": { + "dataset_type_name": "raw", + "run": "DECam/raw/all", + "data_id": { + "instrument": "DECam", + "detector": 56, + "exposure": 411371 + } + }, + "5d32ede2-f2ca-5d3b-b900-bba4a00c3bf9": { + "dataset_type_name": "raw", + "run": "DECam/raw/all", + "data_id": { + "instrument": "DECam", + "detector": 60, + "exposure": 411371 + } + }, + "d24d9a3c-1004-42e7-88bb-a38ebf364084": { + "dataset_type_name": "calexp", + "run": "demo_collection", + "data_id": { + "instrument": "HSC", + "detector": 10, + "visit": 903342 + } + }, + "ac08556d-d6ab-43c5-9307-3bb7132548fd": { + "dataset_type_name": "isr_metadata", + "run": "demo_collection", + "data_id": { + "instrument": "HSC", + "detector": 10, + "exposure": 903342 + } + } + } + }, + "artifact_map": { + "Users/timj/work/lsstsw/build/testdata_decam/hits2015-zeroed/c4d_150218_052850_ori.fits.fz": { + "info": { + "formatter": "lsst.obs.decam.rawFormatter.DarkEnergyCameraRawFormatter", + "path": "file:///Users/timj/work/lsstsw/build/testdata_decam/hits2015-zeroed/c4d_150218_052850_ori.fits.fz", + "storage_class": "Exposure", + "component": null, + "checksum": null, + "file_size": 198720 + }, + "ids": [ + "3f452f1d-b8a5-5202-8190-4e723004a6ba", + "5d32ede2-f2ca-5d3b-b900-bba4a00c3bf9" + ] + }, + "demo_collection/calexp.apCorrMap/20130617/r/HSC-R/903342/calexp_apCorrMap_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsGeneric.FitsGenericFormatter", + "path": "demo_collection/calexp.apCorrMap/20130617/r/HSC-R/903342/calexp_apCorrMap_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "ApCorr", + "component": "apCorrMap", + "checksum": null, + "file_size": 46080 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.detector/20130617/r/HSC-R/903342/calexp_detector_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsGeneric.FitsGenericFormatter", + "path": 
"demo_collection/calexp.detector/20130617/r/HSC-R/903342/calexp_detector_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "Detector", + "component": "detector", + "checksum": null, + "file_size": 1725120 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.filter/20130617/r/HSC-R/903342/calexp_filter_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsGeneric.FitsGenericFormatter", + "path": "demo_collection/calexp.filter/20130617/r/HSC-R/903342/calexp_filter_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "FilterLabel", + "component": "filter", + "checksum": null, + "file_size": 17280 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.id/20130617/r/HSC-R/903342/calexp_id_HSC_r_HSC-R_903342_1_36_demo_collection.json": { + "info": { + "formatter": "lsst.daf.butler.formatters.json.JsonFormatter", + "path": "demo_collection/calexp.id/20130617/r/HSC-R/903342/calexp_id_HSC_r_HSC-R_903342_1_36_demo_collection.json", + "storage_class": "int", + "component": "id", + "checksum": null, + "file_size": 9 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.image/20130617/r/HSC-R/903342/calexp_image_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsExposure.FitsImageFormatter", + "path": "demo_collection/calexp.image/20130617/r/HSC-R/903342/calexp_image_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "ImageF", + "component": "image", + "checksum": null, + "file_size": 31271040 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.mask/20130617/r/HSC-R/903342/calexp_mask_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsExposure.FitsMaskFormatter", + "path": "demo_collection/calexp.mask/20130617/r/HSC-R/903342/calexp_mask_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "MaskX", + "component": "mask", + "checksum": null, + "file_size": 627840 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.metadata/20130617/r/HSC-R/903342/calexp_metadata_HSC_r_HSC-R_903342_1_36_demo_collection.yaml": { + "info": { + "formatter": "lsst.daf.butler.formatters.yaml.YamlFormatter", + "path": "demo_collection/calexp.metadata/20130617/r/HSC-R/903342/calexp_metadata_HSC_r_HSC-R_903342_1_36_demo_collection.yaml", + "storage_class": "PropertyList", + "component": "metadata", + "checksum": null, + "file_size": 14887 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.photoCalib/20130617/r/HSC-R/903342/calexp_photoCalib_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsGeneric.FitsGenericFormatter", + "path": "demo_collection/calexp.photoCalib/20130617/r/HSC-R/903342/calexp_photoCalib_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "PhotoCalib", + "component": "photoCalib", + "checksum": null, + "file_size": 28800 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.psf/20130617/r/HSC-R/903342/calexp_psf_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsGeneric.FitsGenericFormatter", + "path": 
"demo_collection/calexp.psf/20130617/r/HSC-R/903342/calexp_psf_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "Psf", + "component": "psf", + "checksum": null, + "file_size": 192960 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.summaryStats/20130617/r/HSC-R/903342/calexp_summaryStats_HSC_r_HSC-R_903342_1_36_demo_collection.yaml": { + "info": { + "formatter": "lsst.daf.butler.formatters.yaml.YamlFormatter", + "path": "demo_collection/calexp.summaryStats/20130617/r/HSC-R/903342/calexp_summaryStats_HSC_r_HSC-R_903342_1_36_demo_collection.yaml", + "storage_class": "ExposureSummaryStats", + "component": "summaryStats", + "checksum": null, + "file_size": 1332 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.transmissionCurve/20130617/r/HSC-R/903342/calexp_transmissionCurve_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsGeneric.FitsGenericFormatter", + "path": "demo_collection/calexp.transmissionCurve/20130617/r/HSC-R/903342/calexp_transmissionCurve_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "TransmissionCurve", + "component": "transmissionCurve", + "checksum": null, + "file_size": 201600 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.validPolygon/20130617/r/HSC-R/903342/calexp_validPolygon_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsGeneric.FitsGenericFormatter", + "path": "demo_collection/calexp.validPolygon/20130617/r/HSC-R/903342/calexp_validPolygon_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "Polygon", + "component": "validPolygon", + "checksum": null, + "file_size": 17280 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.variance/20130617/r/HSC-R/903342/calexp_variance_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsExposure.FitsImageFormatter", + "path": "demo_collection/calexp.variance/20130617/r/HSC-R/903342/calexp_variance_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "ImageF", + "component": "variance", + "checksum": null, + "file_size": 24782400 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.visitInfo/20130617/r/HSC-R/903342/calexp_visitInfo_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsGeneric.FitsGenericFormatter", + "path": "demo_collection/calexp.visitInfo/20130617/r/HSC-R/903342/calexp_visitInfo_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "VisitInfo", + "component": "visitInfo", + "checksum": null, + "file_size": 28800 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/calexp.wcs/20130617/r/HSC-R/903342/calexp_wcs_HSC_r_HSC-R_903342_1_36_demo_collection.fits": { + "info": { + "formatter": "lsst.obs.base.formatters.fitsGeneric.FitsGenericFormatter", + "path": "demo_collection/calexp.wcs/20130617/r/HSC-R/903342/calexp_wcs_HSC_r_HSC-R_903342_1_36_demo_collection.fits", + "storage_class": "Wcs", + "component": "wcs", + "checksum": null, + "file_size": 25920 + }, + "ids": [ + "d24d9a3c-1004-42e7-88bb-a38ebf364084" + ] + }, + "demo_collection/isr_metadata/20130617/HSCA90334200/isr_metadata_HSC_HSC-R_HSCA90334200_1_36_demo_collection.json": { + "info": { + "formatter": 
"lsst.daf.butler.formatters.json.JsonFormatter", + "path": "demo_collection/isr_metadata/20130617/HSCA90334200/isr_metadata_HSC_HSC-R_HSCA90334200_1_36_demo_collection.json", + "storage_class": "TaskMetadata", + "component": null, + "checksum": null, + "file_size": 7686 + }, + "ids": [ + "ac08556d-d6ab-43c5-9307-3bb7132548fd" + ] + } + } +} diff --git a/tests/test_datasets.py b/tests/test_datasets.py index fc13898cde..df5b9725bd 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -26,6 +26,7 @@ # along with this program. If not, see . import copy +import os import pickle import unittest import uuid @@ -34,12 +35,17 @@ DataCoordinate, DatasetRef, DatasetType, + DimensionConfig, DimensionUniverse, FileDataset, StorageClass, StorageClassFactory, ) from lsst.daf.butler.datastore.stored_file_info import StoredFileInfo +from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex +from lsst.resources import ResourcePath + +TESTDIR = os.path.abspath(os.path.dirname(__file__)) """Tests for datasets module. """ @@ -739,5 +745,30 @@ def testFileDataset(self) -> None: FileDataset(path="other.yaml", refs=[ref, ref2]) +class ZipIndexTestCase(unittest.TestCase): + """Test that a ZipIndex can be read.""" + + def test_v1(self): + """Read a v1 serialization.""" + path = os.path.join(TESTDIR, "data", "zip_index.json") + with open(path) as fd: + index = ZipIndex.model_validate_json(fd.read()) + + self.assertEqual(index.index_version, "V1") + self.assertEqual(len(index), 17) + self.assertEqual(len(index.refs), 4) + + # Reconstruct the refs using the required universe. + universe_version = index.refs.universe_version + namespace = index.refs.universe_namespace + universe_path = ResourcePath( + f"resource://lsst.daf.butler/configs/old_dimensions/{namespace}_universe{universe_version}.yaml" + ) + dimension_config = DimensionConfig(universe_path) + universe = DimensionUniverse(dimension_config) + refs = index.refs.to_refs(universe=universe) + self.assertEqual(len(refs), 4) + + if __name__ == "__main__": unittest.main() From b9ee95b3d42c05568fa25e56ca576f82e71bd3a8 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 16:29:08 -0700 Subject: [PATCH 36/39] Sort refs when retrieving artifacts This is important to ensure that the same prefix is used each time if the artifact has multiple refs associated with it. --- python/lsst/daf/butler/datastores/fileDatastore.py | 4 +++- python/lsst/daf/butler/remote_butler/_remote_butler.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python/lsst/daf/butler/datastores/fileDatastore.py b/python/lsst/daf/butler/datastores/fileDatastore.py index 7928747bd1..177db5aa49 100644 --- a/python/lsst/daf/butler/datastores/fileDatastore.py +++ b/python/lsst/daf/butler/datastores/fileDatastore.py @@ -2118,7 +2118,9 @@ def retrieveArtifacts( # One artifact can be used by multiple DatasetRef. # e.g. DECam. artifact_map: dict[ResourcePath, ArtifactIndexInfo] = {} - for ref in refs: + # Sort to ensure that in many refs to one file situation the same + # ref is used for any prefix that might be added. 
+ for ref in sorted(refs): prefix = str(ref.id)[:8] + "-" if add_prefix else "" for info in records[ref.id]: location = info.file_location(self.locationFactory) diff --git a/python/lsst/daf/butler/remote_butler/_remote_butler.py b/python/lsst/daf/butler/remote_butler/_remote_butler.py index 63d8b7c9be..8f6c8a9914 100644 --- a/python/lsst/daf/butler/remote_butler/_remote_butler.py +++ b/python/lsst/daf/butler/remote_butler/_remote_butler.py @@ -438,7 +438,9 @@ def _retrieve_artifacts( requested_ids = {ref.id for ref in refs} have_copied: dict[ResourcePath, ResourcePath] = {} artifact_map: dict[ResourcePath, ArtifactIndexInfo] = {} - for ref in refs: + # Sort to ensure that in many refs to one file situation the same + # ref is used for any prefix that might be added. + for ref in sorted(refs): prefix = str(ref.id)[:8] + "-" if add_prefix else "" file_info = _to_file_payload(self._get_file_info_for_ref(ref)).file_info for file in file_info: From bb9bab4e928adfe1207e393ec0eef7eb81a1d7b8 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 16:29:52 -0700 Subject: [PATCH 37/39] Enable compression of the index in the zip file --- .../daf/butler/datastores/file_datastore/retrieve_artifacts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index fd76d0ad45..8672db0476 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -558,7 +558,7 @@ def retrieve_and_zip( if not overwrite and zip_path.exists(): raise FileExistsError(f"Output Zip at {zip_path} already exists but cannot overwrite.") with zipfile.ZipFile(zip_path.ospath, "w") as zip: - zip.write(index_path.ospath, index_path.basename()) + zip.write(index_path.ospath, index_path.basename(), compress_type=zipfile.ZIP_DEFLATED) for path, name in index.calc_relative_paths(tmpdir_path, list(artifact_map)).items(): zip.write(path.ospath, name) From 8b8c250142d47519fa5416ad853278b111434bb7 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Tue, 29 Oct 2024 16:54:48 -0700 Subject: [PATCH 38/39] Make SerializedDatasetRefContainerV1 public --- python/lsst/daf/butler/_dataset_ref.py | 176 +++++++++++++++++- .../file_datastore/retrieve_artifacts.py | 164 +--------------- tests/test_datasets.py | 11 ++ 3 files changed, 193 insertions(+), 158 deletions(-) diff --git a/python/lsst/daf/butler/_dataset_ref.py b/python/lsst/daf/butler/_dataset_ref.py index 250e86e87e..9e25b98c14 100644 --- a/python/lsst/daf/butler/_dataset_ref.py +++ b/python/lsst/daf/butler/_dataset_ref.py @@ -34,13 +34,26 @@ "DatasetIdGenEnum", "DatasetRef", "SerializedDatasetRef", + "SerializedDatasetRefContainerV1", + "SerializedDatasetRefContainers", ] import enum +import logging import sys import uuid from collections.abc import Iterable, Mapping -from typing import TYPE_CHECKING, Any, ClassVar, Literal, Protocol, TypeAlias, runtime_checkable +from typing import ( + TYPE_CHECKING, + Annotated, + Any, + ClassVar, + Literal, + Protocol, + Self, + TypeAlias, + runtime_checkable, +) import pydantic from lsst.utils.classes import immutable @@ -50,7 +63,13 @@ from ._dataset_type import DatasetType, SerializedDatasetType from ._named import NamedKeyDict from .datastore.stored_file_info import StoredDatastoreItemInfo -from .dimensions import DataCoordinate, DimensionGroup, DimensionUniverse, SerializedDataCoordinate 
+from .dimensions import ( + DataCoordinate, + DimensionGroup, + DimensionUniverse, + SerializedDataCoordinate, + SerializedDataId, +) from .json import from_json_pydantic, to_json_pydantic from .persistence_context import PersistenceContextVars @@ -63,6 +82,9 @@ DatasetDatastoreRecords: TypeAlias = Mapping[str, list[StoredDatastoreItemInfo]] +_LOG = logging.getLogger(__name__) + + class AmbiguousDatasetError(Exception): """Raised when a `DatasetRef` is not resolved but should be. @@ -864,3 +886,153 @@ class associated with the dataset type of the other ref can be Cannot be changed after a `DatasetRef` is constructed. """ + + +class MinimalistSerializableDatasetRef(pydantic.BaseModel): + """Minimal information needed to define a DatasetRef. + + The ID is not included and is presumed to be the key to a mapping + to this information. + """ + + model_config = pydantic.ConfigDict(frozen=True) + + dataset_type_name: str + """Name of the dataset type.""" + + run: str + """Name of the RUN collection.""" + + data_id: SerializedDataId + """Data coordinate of this dataset.""" + + +class SerializedDatasetRefContainer(pydantic.BaseModel): + """Serializable model for a collection of DatasetRef. + + Dimension records are not included. + """ + + model_config = pydantic.ConfigDict(extra="allow", frozen=True) + container_version: str + + +class SerializedDatasetRefContainerV1(SerializedDatasetRefContainer): + """Serializable model for a collection of DatasetRef. + + Dimension records are not included. + """ + + container_version: Literal["V1"] = "V1" + + universe_version: int + """Dimension universe version.""" + + universe_namespace: str + """Dimension universe namespace.""" + + dataset_types: dict[str, SerializedDatasetType] + """Dataset types indexed by their name.""" + + compact_refs: dict[uuid.UUID, MinimalistSerializableDatasetRef] + """Minimal dataset ref information indexed by UUID.""" + + def __len__(self) -> int: + """Return the number of datasets in the container.""" + return len(self.compact_refs) + + @classmethod + def from_refs(cls, refs: Iterable[DatasetRef]) -> Self: + """Construct a serializable form from a list of `DatasetRef`. + + Parameters + ---------- + refs : `~collections.abc.Iterable` [ `DatasetRef` ] + The datasets to include in the container. + """ + # The serialized DatasetRef contains a lot of duplicated information. + # We also want to drop dimension records and assume that the records + # are already in the registry. + universe: DimensionUniverse | None = None + dataset_types: dict[str, SerializedDatasetType] = {} + compact_refs: dict[uuid.UUID, MinimalistSerializableDatasetRef] = {} + for ref in refs: + simple_ref = ref.to_simple() + dataset_type = simple_ref.datasetType + assert dataset_type is not None # For mypy + if universe is None: + universe = ref.datasetType.dimensions.universe + if (name := dataset_type.name) not in dataset_types: + dataset_types[name] = dataset_type + data_id = simple_ref.dataId + assert data_id is not None # For mypy + compact_refs[simple_ref.id] = MinimalistSerializableDatasetRef( + dataset_type_name=name, run=simple_ref.run, data_id=data_id.dataId + ) + if universe: + universe_version = universe.version + universe_namespace = universe.namespace + else: + # No refs so no universe. 
+ universe_version = 0 + universe_namespace = "unknown" + return cls( + universe_version=universe_version, + universe_namespace=universe_namespace, + dataset_types=dataset_types, + compact_refs=compact_refs, + ) + + def to_refs(self, universe: DimensionUniverse) -> list[DatasetRef]: + """Construct the original `DatasetRef`. + + Parameters + ---------- + universe : `DimensionUniverse` + The universe to use when constructing the `DatasetRef`. + + Returns + ------- + refs : `list` [ `DatasetRef` ] + The `DatasetRef` that were serialized. + """ + if not self.compact_refs: + return [] + + if universe.namespace != self.universe_namespace: + raise RuntimeError( + f"Can not convert to refs in universe {universe.namespace} that were created from " + f"universe {self.universe_namespace}" + ) + + if universe.version != self.universe_version: + _LOG.warning( + "Universe mismatch when attempting to reconstruct DatasetRef from serialized form. " + "Serialized with version %d but asked to use version %d.", + self.universe_version, + universe.version, + ) + + # Reconstruct the DatasetType objects. + dataset_types = { + name: DatasetType.from_simple(dtype, universe=universe) + for name, dtype in self.dataset_types.items() + } + refs: list[DatasetRef] = [] + for id_, minimal in self.compact_refs.items(): + simple_data_id = SerializedDataCoordinate(dataId=minimal.data_id) + data_id = DataCoordinate.from_simple(simple=simple_data_id, universe=universe) + ref = DatasetRef( + id=id_, + run=minimal.run, + datasetType=dataset_types[minimal.dataset_type_name], + dataId=data_id, + ) + refs.append(ref) + return refs + + +SerializedDatasetRefContainers: TypeAlias = Annotated[ + SerializedDatasetRefContainerV1, + pydantic.Field(discriminator="container_version"), +] diff --git a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py index 8672db0476..c143e82712 100644 --- a/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py +++ b/python/lsst/daf/butler/datastores/file_datastore/retrieve_artifacts.py @@ -34,169 +34,21 @@ import uuid import zipfile from collections.abc import Iterable -from typing import Annotated, ClassVar, Literal, Protocol, Self, TypeAlias +from typing import ClassVar, Literal, Protocol, Self -from lsst.daf.butler import DatasetIdFactory, DatasetRef +from lsst.daf.butler import ( + DatasetIdFactory, + DatasetRef, + SerializedDatasetRefContainers, + SerializedDatasetRefContainerV1, +) from lsst.daf.butler.datastore.stored_file_info import SerializedStoredFileInfo from lsst.resources import ResourcePath, ResourcePathExpression -from pydantic import BaseModel, ConfigDict, Field - -from ..._dataset_type import DatasetType, SerializedDatasetType -from ...dimensions import DataCoordinate, DimensionUniverse, SerializedDataCoordinate, SerializedDataId +from pydantic import BaseModel _LOG = logging.getLogger(__name__) -class MinimalistDatasetRef(BaseModel): - """Minimal information needed to define a DatasetRef. - - The ID is not included and is presumed to be the key to a mapping - to this information. - """ - - model_config = ConfigDict(frozen=True) - - dataset_type_name: str - """Name of the dataset type.""" - - run: str - """Name of the RUN collection.""" - - data_id: SerializedDataId - """Data coordinate of this dataset.""" - - -class SerializedDatasetRefContainer(BaseModel): - """Serializable model for a collection of DatasetRef. - - Dimension records are not included. 
- """ - - model_config = ConfigDict(extra="allow", frozen=True) - container_version: str - - -class SerializedDatasetRefContainerV1(SerializedDatasetRefContainer): - """Serializable model for a collection of DatasetRef. - - Dimension records are not included. - """ - - container_version: Literal["V1"] = "V1" - - universe_version: int - """Dimension universe version.""" - - universe_namespace: str - """Dimension universe namespace.""" - - dataset_types: dict[str, SerializedDatasetType] - """Dataset types indexed by their name.""" - - compact_refs: dict[uuid.UUID, MinimalistDatasetRef] - """Minimal dataset ref information indexed by UUID.""" - - def __len__(self) -> int: - """Return the number of datasets in the container.""" - return len(self.compact_refs) - - @classmethod - def from_refs(cls, refs: Iterable[DatasetRef]) -> Self: - """Construct a serializable form from a list of `DatasetRef`. - - Parameters - ---------- - refs : `~collections.abc.Iterable` [ `DatasetRef` ] - The datasets to include in the container. - """ - # The serialized DatasetRef contains a lot of duplicated information. - # We also want to drop dimension records and assume that the records - # are already in the registry. - universe: DimensionUniverse | None = None - dataset_types: dict[str, SerializedDatasetType] = {} - compact_refs: dict[uuid.UUID, MinimalistDatasetRef] = {} - for ref in refs: - simple_ref = ref.to_simple() - dataset_type = simple_ref.datasetType - assert dataset_type is not None # For mypy - if universe is None: - universe = ref.datasetType.dimensions.universe - if (name := dataset_type.name) not in dataset_types: - dataset_types[name] = dataset_type - data_id = simple_ref.dataId - assert data_id is not None # For mypy - compact_refs[simple_ref.id] = MinimalistDatasetRef( - dataset_type_name=name, run=simple_ref.run, data_id=data_id.dataId - ) - if universe: - universe_version = universe.version - universe_namespace = universe.namespace - else: - # No refs so no universe. - universe_version = 0 - universe_namespace = "unknown" - return cls( - universe_version=universe_version, - universe_namespace=universe_namespace, - dataset_types=dataset_types, - compact_refs=compact_refs, - ) - - def to_refs(self, universe: DimensionUniverse) -> list[DatasetRef]: - """Construct the original `DatasetRef`. - - Parameters - ---------- - universe : `DimensionUniverse` - The universe to use when constructing the `DatasetRef`. - - Returns - ------- - refs : `list` [ `DatasetRef` ] - The `DatasetRef` that were serialized. - """ - if not self.compact_refs: - return [] - - if universe.namespace != self.universe_namespace: - raise RuntimeError( - f"Can not convert to refs in universe {universe.namespace} that were created from " - f"universe {self.universe_namespace}" - ) - - if universe.version != self.universe_version: - _LOG.warning( - "Universe mismatch when attempting to reconstruct DatasetRef from serialized form. " - "Serialized with version %d but asked to use version %d.", - self.universe_version, - universe.version, - ) - - # Reconstruct the DatasetType objects. 
- dataset_types = { - name: DatasetType.from_simple(dtype, universe=universe) - for name, dtype in self.dataset_types.items() - } - refs: list[DatasetRef] = [] - for id_, minimal in self.compact_refs.items(): - simple_data_id = SerializedDataCoordinate(dataId=minimal.data_id) - data_id = DataCoordinate.from_simple(simple=simple_data_id, universe=universe) - ref = DatasetRef( - id=id_, - run=minimal.run, - datasetType=dataset_types[minimal.dataset_type_name], - dataId=data_id, - ) - refs.append(ref) - return refs - - -SerializedDatasetRefContainers: TypeAlias = Annotated[ - SerializedDatasetRefContainerV1, - Field(discriminator="container_version"), -] - - class ArtifactIndexInfo(BaseModel): """Information related to an artifact in an index.""" diff --git a/tests/test_datasets.py b/tests/test_datasets.py index df5b9725bd..0e8123254e 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -38,6 +38,7 @@ DimensionConfig, DimensionUniverse, FileDataset, + SerializedDatasetRefContainerV1, StorageClass, StorageClassFactory, ) @@ -744,6 +745,16 @@ def testFileDataset(self) -> None: with self.assertRaises(ValueError): FileDataset(path="other.yaml", refs=[ref, ref2]) + def test_container(self) -> None: + ref1 = DatasetRef(self.datasetType, self.dataId, run="somerun") + ref2 = ref1.replace(run="somerun2") + + container = SerializedDatasetRefContainerV1.from_refs([ref1, ref2]) + self.assertEqual(len(container), 2) + + new_refs = container.to_refs(universe=self.universe) + self.assertEqual(new_refs, [ref1, ref2]) + class ZipIndexTestCase(unittest.TestCase): """Test that a ZipIndex can be read.""" From c510c7feb7a67f41a4e6a879232e6ca2576290c1 Mon Sep 17 00:00:00 2001 From: Tim Jenness Date: Wed, 30 Oct 2024 13:34:25 -0700 Subject: [PATCH 39/39] Add news fragment --- doc/changes/DM-46776.feature.rst | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 doc/changes/DM-46776.feature.rst diff --git a/doc/changes/DM-46776.feature.rst b/doc/changes/DM-46776.feature.rst new file mode 100644 index 0000000000..3d4e8d0015 --- /dev/null +++ b/doc/changes/DM-46776.feature.rst @@ -0,0 +1,7 @@ +* Added ``Butler.retrieve_artifacts_zip`` and ``QuantumBackedButler.retrieve_artifacts_zip`` methods to retrieve the dataset artifacts and store them into a zip file. +* Added ``Butler.ingest_zip`` to ingest the contents of a Zip file. +* Added ``SerializedDatasetRefContainerV1`` class to allow a collection of ``DatasetRef`` to be serialized efficiently. + JSON serializations made using this class will be supported. +* Added ``--zip`` parameter to ``butler retrieve-artifacts``. +* Changed ``Butler.retrieveArtifacts`` to always write a JSON index file describing where the artifacts came from. +* Added a ``butler ingest-zip`` command-line tool for ingesting zip files created by ``butler retrieve-artifacts``.
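Taken together, a minimal end-to-end sketch of the new workflow (assuming a
configured repository at "REPO" and a query that resolves some refs; the
repository path, dataset type, and collection name are illustrative, and
Butler.ingest_zip is assumed to mirror the datastore-level
(zip_path, transfer) signature shown earlier):

    from lsst.daf.butler import Butler
    from lsst.daf.butler.datastores.file_datastore.retrieve_artifacts import ZipIndex

    butler = Butler.from_config("REPO")
    refs = butler.query_datasets("calexp", collections="demo_collection")

    # Retrieve the artifacts for these refs into a single indexed Zip file.
    zip_path = butler.retrieve_artifacts_zip(refs, destination=".", overwrite=True)

    # The Zip carries its own JSON index and can be inspected without a registry.
    index = ZipIndex.from_zip_file(zip_path)
    print(f"{len(index)} artifacts covering {len(index.refs)} datasets")

    # The same Zip can later be ingested into a writeable repository:
    # butler.ingest_zip(zip_path, transfer="auto")

The command-line equivalents are ``butler retrieve-artifacts --zip`` and
``butler ingest-zip``.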