From 098847e15c1242b76056a37766f3315a60b1dc0c Mon Sep 17 00:00:00 2001
From: ljgray
Date: Tue, 25 Apr 2023 11:45:23 -0700
Subject: [PATCH] fix(memh5): block axis downselection in shared datasets

---
 caput/memh5.py | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/caput/memh5.py b/caput/memh5.py
index d3ecf2c6..73889df9 100644
--- a/caput/memh5.py
+++ b/caput/memh5.py
@@ -2541,6 +2541,8 @@ def deep_group_copy(
     >>> list(g2["foo"]["bar"])
     [0, 1]
 
+    Axis downselections cannot be applied to shared datasets.
+
     Parameters
     ----------
     g1 : h5py.Group or zarr.Group
@@ -2550,7 +2552,7 @@ def deep_group_copy(
     selections : dict
         If this is not None, it should have a subset of the same hierarchical
         structure as g1, but ultimately describe axis selections for group entries as valid
-        numpy indexes.
+        numpy indexes. Selections cannot be applied to shared datasets.
     convert_attribute_strings : bool, optional
         Convert string attributes (or lists/arrays of them) to ensure that they are
         unicode.
@@ -2567,12 +2569,11 @@ def deep_group_copy(
         entries, and can modify either.
     deep_copy_dsets : bool, optional
         Explicitly deep copy all datasets. This will only alter behaviour when copying
-        from memory to memory. XXX: enabling this in places where it is not currently
-        enabled could break legacy code, so be very careful
+        from memory to memory.
     shared : list, optional
         List of datasets to share, if `deep_copy_dsets` is True. Otherwise, no effect.
-        Shared datasets just point to the existing object in g1 storage, and override
-        any other behaviour
+        Shared datasets just point to the existing object in g1 storage. Axis selections
+        cannot be applied to shared datasets.
 
     Returns
     -------
@@ -2588,10 +2589,9 @@ def deep_group_copy(
 
     to_file = isinstance(g2, file_format.module.Group)
 
-    # Prepare a dataset for writing out, applying selections and transforming any
-    # datatypes
-    # Returns: dict(dtype, shape, data_to_write)
-    def _prepare_dataset(dset):
+    # Get the selection associated with this dataset
+    # Returns: slice
+    def _get_selection(dset):
         # Look for a selection for this dataset (also try without the leading "/")
         try:
             selection = selections.get(
@@ -2600,6 +2600,14 @@ def _prepare_dataset(dset):
         except AttributeError:
             selection = slice(None)
 
+        return selection
+
+    # Prepare a dataset for writing out, applying selections and transforming any
+    # datatypes
+    # Returns: dict(dtype, shape, data_to_write)
+    def _prepare_dataset(dset):
+        selection = _get_selection(dset)
+
         # Check if this is a distributed dataset and figure out if we can make this work
         # out
         if to_file and isinstance(dset, MemDatasetDistributed):
@@ -2706,6 +2714,11 @@ def _prepare_compression_args(dset):
             stack += [entry[k] for k in sorted(entry, reverse=True)]
 
         elif key in shared:
+            # Make sure that we aren't trying to apply a selection to this dataset
+            if _get_selection(entry) != slice(None):
+                raise ValueError(
+                    f"Cannot apply a selection to a shared dataset ({entry.name})"
+                )
             # Just point to the existing dataset
             parent_name, name = posixpath.split(posixpath.join(g2.name, key))
             parent_name = format_abs_path(parent_name)
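
Usage sketch of the behaviour this change enforces, assuming the MemGroup API
and the deep_group_copy keyword arguments documented above (`selections`,
`deep_copy_dsets`, `shared`); the "/dset" path form used for the selection and
shared keys below is an assumption, not taken from the patch.

    import numpy as np
    from caput import memh5

    # Build a small in-memory group with one dataset
    g1 = memh5.MemGroup()
    g1.create_dataset("dset", data=np.arange(10))

    # Sharing without a selection is fine: the copy points at g1's storage
    g2 = memh5.MemGroup()
    memh5.deep_group_copy(g1, g2, deep_copy_dsets=True, shared=["/dset"])

    # A selection on a shared dataset is now rejected rather than silently ignored
    g3 = memh5.MemGroup()
    try:
        memh5.deep_group_copy(
            g1,
            g3,
            selections={"/dset": np.s_[:5]},
            deep_copy_dsets=True,
            shared=["/dset"],
        )
    except ValueError as err:
        print(err)  # e.g. "Cannot apply a selection to a shared dataset (/dset)"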