From e80f50968a6120cf0bf6fcd85dac6cf2f12a9ad4 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 9 Oct 2024 09:51:33 -0400 Subject: [PATCH] [python] Let registrar provide new shapes for resize (#3152) * [python] Complete 3140 * add unit-test case * [python] Let registration provide new shapes for resize [skip ci] * unit-test cases --- .../_registration/ambient_label_mappings.py | 22 +++++++++++- .../tests/test_registration_mappings.py | 34 +++++++++++++++++-- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/apis/python/src/tiledbsoma/io/_registration/ambient_label_mappings.py b/apis/python/src/tiledbsoma/io/_registration/ambient_label_mappings.py index b391ddbc74..5d8723ffd2 100644 --- a/apis/python/src/tiledbsoma/io/_registration/ambient_label_mappings.py +++ b/apis/python/src/tiledbsoma/io/_registration/ambient_label_mappings.py @@ -354,7 +354,7 @@ def _acquire_experiment_mappings( if experiment_uri is not None: if not tiledbsoma.Experiment.exists(experiment_uri, context=context): - raise ValueError("cannot find experiment at URI {experiment_uri}") + raise ValueError(f"cannot find experiment at URI {experiment_uri}") # Pre-check with tiledbsoma.Experiment.open(experiment_uri, context=context) as exp: @@ -488,6 +488,26 @@ def __str__(self) -> str: lines.append(f"{k}/var:{len(v.data)}") return "\n".join(lines) + def get_obs_shape(self) -> int: + """Reports the new obs shape which the experiment will need to be + resized to in order to accommodate the data contained within the + registration.""" + if len(self.obs_axis.data.values()) == 0: + return 0 + return 1 + max(self.obs_axis.data.values()) + + def get_var_shapes(self) -> Dict[str, int]: + """Reports the new var shapes, one per measurement, which the experiment + will need to be resized to in order to accommodate the data contained + within the registration.""" + retval: Dict[str, int] = {} + for key, axis in self.var_axes.items(): + if len(axis.data.values()) == 0: + retval[key] = 0 + else: + retval[key] = 1 + max(axis.data.values()) + return retval + def to_json(self) -> str: return json.dumps(self, default=attrs.asdict, sort_keys=True, indent=4) diff --git a/apis/python/tests/test_registration_mappings.py b/apis/python/tests/test_registration_mappings.py index 45065b6a68..d5fd8b392a 100644 --- a/apis/python/tests/test_registration_mappings.py +++ b/apis/python/tests/test_registration_mappings.py @@ -242,8 +242,8 @@ def test_pandas_indexing( signature_col_names: List[Union[str, Tuple[str, str]]], ): """ - The `default_index_name` for registration can interact with column- and index-names in a variety of ways; this test - exercises several of them. + The `default_index_name` for registration can interact with column- and + index-names in a variety of ways; this test exercises several of them. """ df = PANDAS_INDEXING_TEST_DF.copy() index_col = index_col_and_name[0] @@ -300,6 +300,9 @@ def test_isolated_anndata_mappings(obs_field_name, var_field_name): ["RAW2", "TP53", "VEGFA"] ).data == (6, 3, 4) + assert rd.get_obs_shape() == 3 + assert rd.get_var_shapes() == {"measname": 5, "raw": 7} + @pytest.mark.parametrize("obs_field_name", ["obs_id", "cell_id"]) @pytest.mark.parametrize("var_field_name", ["var_id", "gene_id"]) @@ -319,6 +322,9 @@ def test_isolated_h5ad_mappings(obs_field_name, var_field_name): ["RAW2", "TP53", "VEGFA"] ).data == (6, 3, 4) + assert rd.get_obs_shape() == 3 + assert rd.get_var_shapes() == {"measname": 5, "raw": 7} + @pytest.mark.parametrize("obs_field_name", ["obs_id", "cell_id"]) @pytest.mark.parametrize("var_field_name", ["var_id", "gene_id"]) @@ -337,6 +343,9 @@ def test_isolated_soma_experiment_mappings(obs_field_name, var_field_name): ["RAW2", "TP53", "VEGFA"] ).data == (6, 3, 4) + assert rd.get_obs_shape() == 3 + assert rd.get_var_shapes() == {"measname": 5, "raw": 7} + @pytest.mark.parametrize("obs_field_name", ["obs_id", "cell_id"]) @pytest.mark.parametrize("var_field_name", ["var_id", "gene_id"]) @@ -430,6 +439,9 @@ def test_multiples_without_experiment( "ZZZ3": 9, } + assert rd.get_obs_shape() == 12 + assert rd.get_var_shapes() == {"measname": 7, "raw": 10} + # Now do the ingestion per se. Note that once registration is done sequentially, ingest order # mustn't matter, and in fact, can be done in parallel. This is why we test various permutations # of the ordering of the h5ad file names. @@ -677,6 +689,9 @@ def test_multiples_with_experiment(obs_field_name, var_field_name): "ZZZ3": 9, } + assert rd.get_obs_shape() == 12 + assert rd.get_var_shapes() == {"measname": 7, "raw": 10} + @pytest.mark.parametrize("obs_field_name", ["obs_id", "cell_id"]) @pytest.mark.parametrize("var_field_name", ["var_id", "gene_id"]) @@ -691,6 +706,9 @@ def test_append_items_with_experiment(obs_field_name, var_field_name): var_field_name=var_field_name, ) + assert rd.get_obs_shape() == 6 + assert rd.get_var_shapes() == {"measname": 5, "raw": 7} + adata2 = ad.read_h5ad(h5ad2) original = adata2.copy() @@ -1054,6 +1072,9 @@ def test_registration_with_batched_reads(tmp_path, soma_larger, use_small_buffer assert len(rd.obs_axis.data) == 1000 + assert rd.get_obs_shape() == 1000 + assert rd.get_var_shapes() == {"measname": 6} + def test_ealm_expose(): """Checks that this is exported from tiledbsoma.io._registration""" @@ -1163,6 +1184,9 @@ def test_enum_bit_width_append(tmp_path, all_at_once, nobs_a, nobs_b): var_field_name=var_field_name, ) + assert rd.get_obs_shape() == nobs_a + nobs_b + assert rd.get_var_shapes() == {"meas": 4, "raw": 0} + tiledbsoma.io.from_anndata( soma_uri, adata, measurement_name=measurement_name, registration_mapping=rd ) @@ -1181,6 +1205,9 @@ def test_enum_bit_width_append(tmp_path, all_at_once, nobs_a, nobs_b): var_field_name=var_field_name, ) + assert rd.get_obs_shape() == nobs_a + nobs_b + assert rd.get_var_shapes() == {"meas": 4} + tiledbsoma.io.from_anndata( soma_uri, bdata, measurement_name=measurement_name, registration_mapping=rd ) @@ -1256,6 +1283,9 @@ def test_multimodal_names(tmp_path, conftest_pbmc3k_adata): var_field_name=adata_protein.var.index.name, ) + assert rd.get_obs_shape() == 2638 + assert rd.get_var_shapes() == {"protein": 500, "raw": 13714} + # Ingest the second anndata object into the protein measurement tiledbsoma.io.from_anndata( experiment_uri=uri,