From fe5dfdbc89ef3ff56b067f021ef532ff62cb882f Mon Sep 17 00:00:00 2001 From: ljgray Date: Fri, 21 Jul 2023 11:50:12 -0700 Subject: [PATCH] feat(memh5): add option to only include specific datasets when copying --- caput/memh5.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/caput/memh5.py b/caput/memh5.py index 2b5e0e4b..23916d67 100644 --- a/caput/memh5.py +++ b/caput/memh5.py @@ -2038,7 +2038,9 @@ def flush(self): if self.ondisk: self._data.flush() - def copy(self, shared: list = [], shallow: bool = False) -> MemDiskGroup: + def copy( + self, shared: list = [], shallow: bool = False, include: list = [] + ) -> MemDiskGroup: """Return a deep copy of this class or subclass. Parameters @@ -2047,6 +2049,9 @@ def copy(self, shared: list = [], shallow: bool = False) -> MemDiskGroup: dataset names to share (i.e. don't deep copy) shallow True if this should be a shallow copy + include + If provided, only the datasets in this list will be included. Otherwise, all + datasets are copied. This does not change the behaviour of `shared` Returns ------- @@ -2056,7 +2061,11 @@ def copy(self, shared: list = [], shallow: bool = False) -> MemDiskGroup: cls = self.__class__.__new__(self.__class__) MemDiskGroup.__init__(cls, distributed=self.distributed, comm=self.comm) deep_group_copy( - self._data, cls._data, deep_copy_dsets=not shallow, shared=shared + self._data, + cls._data, + deep_copy_dsets=not shallow, + shared=shared, + include=include, ) return cls @@ -2523,6 +2532,7 @@ def deep_group_copy( postprocess=None, deep_copy_dsets=False, shared=[], + include=[], ): """Copy full data tree from one group to another. @@ -2576,6 +2586,9 @@ def deep_group_copy( List of datasets to share, if `deep_copy_dsets` is True. Otherwise, no effect. Shared datasets just point to the existing object in g1 storage. Axis selections cannot be applied to shared datasets. 
+ include : list, optional + If provided, only the datasets in this list will be included. Otherwise, all + datasets are copied. This does not change the behaviour of `shared`. Returns ------- @@ -2699,6 +2712,7 @@ def _prepare_compression_args(dset): # Make sure shared dataset names are properly formatted shared = {"/" + k if k[0] != "/" else k for k in shared} + include = {"/" + k if k[0] != "/" else k for k in include} # Do a non-recursive traversal of the tree, recreating the structure and attributes, # and copying over any non-distributed datasets @@ -2716,18 +2730,24 @@ def _prepare_compression_args(dset): stack += [entry[k] for k in sorted(entry, reverse=True)] elif key in shared: + if include and key not in include: + continue + # Make sure that we aren't trying to apply a selection to this dataset if _get_selection(entry) != slice(None): raise ValueError( f"Cannot apply a selection to a shared dataset ({entry.name})" ) - # Just point to the existing dataset + # Point to the existing dataset parent_name, name = posixpath.split(posixpath.join(g2.name, key)) parent_name = format_abs_path(parent_name) # Get the proper storage location for this dataset g2[parent_name]._get_storage()[name] = g1._get_storage()[key] else: + if include and key not in include: + continue + # Copy over this dataset dset_args = _prepare_dataset(entry) compression_kwargs = _prepare_compression_args(entry)