feat(memh5): add option to only include specific datasets when copying
ljgray committed Jul 21, 2023
1 parent fcd3cc1 commit fe5dfdb
Showing 1 changed file with 23 additions and 3 deletions.
caput/memh5.py
@@ -2038,7 +2038,9 @@ def flush(self):
         if self.ondisk:
             self._data.flush()
 
-    def copy(self, shared: list = [], shallow: bool = False) -> MemDiskGroup:
+    def copy(
+        self, shared: list = [], shallow: bool = False, include: list = []
+    ) -> MemDiskGroup:
         """Return a deep copy of this class or subclass.
 
         Parameters
@@ -2047,6 +2049,9 @@ def copy(self, shared: list = [], shallow: bool = False) -> MemDiskGroup:
             dataset names to share (i.e. don't deep copy)
         shallow
             True if this should be a shallow copy
+        include
+            If provided, only the datasets in this list will be included. Otherwise, all
+            datasets are copied. This does not change the behaviour of `shared`.
 
         Returns
         -------
@@ -2056,7 +2061,11 @@ def copy(self, shared: list = [], shallow: bool = False) -> MemDiskGroup:
         cls = self.__class__.__new__(self.__class__)
         MemDiskGroup.__init__(cls, distributed=self.distributed, comm=self.comm)
         deep_group_copy(
-            self._data, cls._data, deep_copy_dsets=not shallow, shared=shared
+            self._data,
+            cls._data,
+            deep_copy_dsets=not shallow,
+            shared=shared,
+            include=include,
         )
 
         return cls
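
A minimal usage sketch of the new parameter (the container `data` and the dataset names "vis", "weight" and "gain" are hypothetical, not part of this commit):

    # Hypothetical container holding datasets "vis", "weight" and "gain".
    # Keep only "vis" and "weight" in the copy; "vis" points at the original
    # storage rather than being deep-copied, and "gain" is dropped entirely.
    new_data = data.copy(shared=["vis"], include=["vis", "weight"])

As the docstring notes, `include` does not change the behaviour of `shared`: a dataset listed in both is still shared, while a shared dataset omitted from `include` is skipped like any other.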
@@ -2523,6 +2532,7 @@ def deep_group_copy(
     postprocess=None,
     deep_copy_dsets=False,
     shared=[],
+    include=[],
 ):
     """Copy full data tree from one group to another.
 
@@ -2576,6 +2586,9 @@
         List of datasets to share, if `deep_copy_dsets` is True. Otherwise, no effect.
         Shared datasets just point to the existing object in g1 storage. Axis selections
         cannot be applied to shared datasets.
+    include : list, optional
+        If provided, only the datasets in this list will be included. Otherwise, all
+        datasets are copied. This does not change the behaviour of `shared`.
 
     Returns
     -------
@@ -2699,6 +2712,7 @@ def _prepare_compression_args(dset):
 
     # Make sure shared dataset names are properly formatted
     shared = {"/" + k if k[0] != "/" else k for k in shared}
+    include = {"/" + k if k[0] != "/" else k for k in include}
 
     # Do a non-recursive traversal of the tree, recreating the structure and attributes,
    # and copying over any non-distributed datasets
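
Both `shared` and `include` are normalised to absolute dataset paths before the traversal, so callers can pass names with or without the leading slash. A standalone illustration of the set comprehension above (not library code):

    names = ["vis", "/gain"]
    normalised = {"/" + k if k[0] != "/" else k for k in names}
    print(normalised)  # {'/vis', '/gain'}

Since the traversal compares absolute names such as "/vis", `include=["vis"]` and `include=["/vis"]` behave identically.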
@@ -2716,18 +2730,24 @@ def _prepare_compression_args(dset):
             stack += [entry[k] for k in sorted(entry, reverse=True)]
 
         elif key in shared:
+            if include and key not in include:
+                continue
+
             # Make sure that we aren't trying to apply a selection to this dataset
             if _get_selection(entry) != slice(None):
                 raise ValueError(
                     f"Cannot apply a selection to a shared dataset ({entry.name})"
                 )
-            # Just point to the existing dataset
+            # Point to the existing dataset
             parent_name, name = posixpath.split(posixpath.join(g2.name, key))
             parent_name = format_abs_path(parent_name)
             # Get the proper storage location for this dataset
             g2[parent_name]._get_storage()[name] = g1._get_storage()[key]
 
         else:
+            if include and key not in include:
+                continue
+
             # Copy over this dataset
             dset_args = _prepare_dataset(entry)
             compression_kwargs = _prepare_compression_args(entry)
