From 7fd49a14417fcce6185b3d5d51b739328a00f91d Mon Sep 17 00:00:00 2001
From: Vivian Nguyen
Date: Thu, 10 Oct 2024 17:12:25 -0700
Subject: [PATCH] Rename filter_list -> filters

Rename the "filter_list" key to "filters" in the per-attribute and
per-dimension JSON emitted by ArrowAdapter, and update the C++ and
Python tests to match. The Python tests now validate array schemas
through config_options_from_schema() rather than tiledb-py, so the
hastiledb guards and imports are dropped; consolidate_and_vacuum also
gains a default "modes" argument of {"fragment_meta", "commits"}.
---
 apis/python/src/tiledbsoma/soma_array.cc  |  6 +-
 apis/python/tests/test_dataframe.py       | 70 +++++++++++-----------
 apis/python/tests/test_dense_nd_array.py  |  4 +-
 apis/python/tests/test_sparse_nd_array.py | 71 +++++++++--------------
 libtiledbsoma/src/utils/arrow_adapter.cc  |  4 +-
 libtiledbsoma/test/unit_soma_dataframe.cc |  7 +--
 6 files changed, 74 insertions(+), 88 deletions(-)

diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc
index 5f68f1a79b..5f40f8bc26 100644
--- a/apis/python/src/tiledbsoma/soma_array.cc
+++ b/apis/python/src/tiledbsoma/soma_array.cc
@@ -946,7 +946,11 @@ void load_soma_array(py::module& m) {
         .def_property_readonly("dimension_names", &SOMAArray::dimension_names)
 
-        .def("consolidate_and_vacuum", &SOMAArray::consolidate_and_vacuum)
+        .def(
+            "consolidate_and_vacuum",
+            &SOMAArray::consolidate_and_vacuum,
+            py::arg(
+                "modes") = std::vector<std::string>{"fragment_meta", "commits"})
 
         .def_property_readonly(
             "meta",
diff --git a/apis/python/tests/test_dataframe.py b/apis/python/tests/test_dataframe.py
index 44d577f36c..09b8862069 100644
--- a/apis/python/tests/test_dataframe.py
+++ b/apis/python/tests/test_dataframe.py
@@ -1,6 +1,7 @@
 import contextlib
 import datetime
-import os
+import json
+from pathlib import Path
 from typing import Dict, List
 
 import numpy as np
@@ -145,13 +146,15 @@ def test_dataframe(tmp_path, arrow_schema):
     assert [e.as_py() for e in table["baz"]] == pydict["baz"]
     assert [e.as_py() for e in table["quux"]] == pydict["quux"]
 
-    # Validate TileDB array schema
-    if hastiledb:
-        with tiledb.open(uri) as A:
-            assert A.schema.sparse
-            assert not A.schema.allows_duplicates
-            assert A.dim("foo").filters == [tiledb.ZstdFilter(level=3)]
-            assert A.attr("bar").filters == [tiledb.ZstdFilter()]
+    with soma.DataFrame.open(uri) as A:
+        cfg = A.config_options_from_schema()
+        assert not cfg.allows_duplicates
+        assert json.loads(cfg.dims)["foo"]["filters"] == [
+            {"COMPRESSION_LEVEL": 3, "name": "ZSTD"}
+        ]
+        assert json.loads(cfg.attrs)["bar"]["filters"] == [
+            {"COMPRESSION_LEVEL": -1, "name": "ZSTD"}
+        ]
 
     with soma.DataFrame.open(uri) as sdf:
         assert sdf.count == 5
@@ -1054,34 +1057,27 @@ def test_result_order(tmp_path):
             next(sdf.read(result_order="bogus"))
 
 
-@pytest.mark.skipif(not hastiledb, reason="tiledb-py not installed")
 @pytest.mark.parametrize(
     "create_options,expected_schema_fields",
     (
         (
             {"allows_duplicates": True},
             {
-                "validity_filters": (
-                    tiledb.FilterList([tiledb.RleFilter()]) if hastiledb else None
-                ),
+                "validity_filters": [{"COMPRESSION_LEVEL": -1, "name": "RLE"}],
                 "allows_duplicates": True,
             },
         ),
         (
             {"allows_duplicates": False},
             {
-                "validity_filters": (
-                    tiledb.FilterList([tiledb.RleFilter()]) if hastiledb else None
-                ),
+                "validity_filters": [{"COMPRESSION_LEVEL": -1, "name": "RLE"}],
                 "allows_duplicates": False,
             },
         ),
        (
             {"validity_filters": ["NoOpFilter"], "allows_duplicates": False},
             {
-                "validity_filters": (
-                    tiledb.FilterList([tiledb.NoOpFilter()]) if hastiledb else None
-                ),
+                "validity_filters": [{"name": "NOOP"}],
                 "allows_duplicates": False,
             },
         ),
     ),
 )
@@ -1096,14 +1092,17 @@ def test_create_platform_config_overrides(
     tmp_path, create_options, expected_schema_fields
 ):
     uri = tmp_path.as_posix()
     soma.DataFrame.create(
         uri,
         schema=pa.schema([pa.field("colA", pa.string())]),
         platform_config={"tiledb": {"create": {**create_options}}},
     ).close()
-    with tiledb.open(uri) as D:
-        for k, v in expected_schema_fields.items():
-            assert getattr(D.schema, k) == v
+
+    with soma.DataFrame.open(tmp_path.as_posix()) as A:
+        cfg = A.config_options_from_schema()
+        assert expected_schema_fields["validity_filters"] == json.loads(
+            cfg.validity_filters
+        )
+        assert expected_schema_fields["allows_duplicates"] == cfg.allows_duplicates
 
 
 @pytest.mark.parametrize("allows_duplicates", [False, True])
 @pytest.mark.parametrize("consolidate", [False, True])
-@pytest.mark.skipif(not hastiledb, reason="tiledb-py not installed")
 def test_timestamped_ops(tmp_path, allows_duplicates, consolidate):
     uri = tmp_path.as_posix()
 
@@ -1140,21 +1139,21 @@ def test_timestamped_ops(tmp_path, allows_duplicates, consolidate):
         "float": [200.2, 300.3],
         "string": ["ball", "cat"],
     }
-    sidf.write(pa.Table.from_pydict(data))
+
+    # Without consolidate:
+    # * There are two fragments:
+    #   o One with tiledb.fragment.FragmentInfoList[i].timestamp_range = (10, 10)
+    #   o One with tiledb.fragment.FragmentInfoList[i].timestamp_range = (20, 20)
+    # With consolidate:
+    # * There is one fragment:
+    #   o One with tiledb.fragment.FragmentInfoList[i].timestamp_range = (10, 20)
+    sidf.write(
+        pa.Table.from_pydict(data),
+        soma.TileDBWriteOptions(consolidate_and_vacuum=consolidate),
+    )
 
     assert sidf.tiledb_timestamp_ms == 1615403005000
     assert sidf.tiledb_timestamp.isoformat() == "2021-03-10T19:03:25+00:00"
 
-    # Without consolidate:
-    # * There are two fragments:
-    #   o One with tiledb.fragment.FragmentInfoList[i].timestamp_range = (10, 10)
-    #   o One with tiledb.fragment.FragmentInfoList[i].timestamp_range = (20, 20)
-    # With consolidate:
-    # * There is one fragment:
-    #   o One with tiledb.fragment.FragmentInfoList[i].timestamp_range = (10, 20)
-    if consolidate:
-        tiledb.consolidate(uri)
-        tiledb.vacuum(uri)
-
     # read without timestamp (i.e., after final write) & see final image
     with soma.DataFrame.open(uri) as sidf:
         table = sidf.read().concat()
@@ -1735,10 +1734,9 @@ def test_only_evolve_schema_when_enmr_is_extended(tmp_path):
 
     # total 3 fragment files
-    vfs = tiledb.VFS()
     # subtract 1 for the __schema/__enumerations directory;
     # only looking at fragment files
-    assert len(vfs.ls(os.path.join(uri, "__schema"))) - 1 == 3
+    assert len(list((Path(uri) / "__schema").iterdir())) - 1 == 3
 
 
 def test_fix_update_dataframe_with_var_strings(tmp_path):
diff --git a/apis/python/tests/test_dense_nd_array.py b/apis/python/tests/test_dense_nd_array.py
index 8e7b6c4e05..87b7c29415 100644
--- a/apis/python/tests/test_dense_nd_array.py
+++ b/apis/python/tests/test_dense_nd_array.py
@@ -1,12 +1,12 @@
 import contextlib
 import datetime
+import json
 import pathlib
 from typing import Tuple
 
 import numpy as np
 import pyarrow as pa
 import pytest
-import json
 
 import tiledbsoma as soma
 from tiledbsoma.options import SOMATileDBContext
@@ -494,4 +494,4 @@ def test_read_to_unwritten_array(tmp_path, shape):
     soma.DenseNDArray.create(uri, type=pa.uint8(), shape=shape)
 
     with soma.DenseNDArray.open(uri, "r") as A:
-        assert np.array_equal(np.ones(shape)*255, A.read().to_numpy())
+        assert np.array_equal(np.ones(shape) * 255, A.read().to_numpy())
diff --git a/apis/python/tests/test_sparse_nd_array.py b/apis/python/tests/test_sparse_nd_array.py
index 079b1124c8..1a4cf6adda 100644
--- a/apis/python/tests/test_sparse_nd_array.py
+++ b/apis/python/tests/test_sparse_nd_array.py
@@ -3,6 +3,7 @@
 import contextlib
 import datetime
 import itertools
+import json
 import operator
 import pathlib
 import sys
@@ -19,13 +20,6 @@
 from tiledbsoma import _factory
 from tiledbsoma.options import SOMATileDBContext
 
-try:
-    import tiledb
-
-    hastiledb = True
-except ModuleNotFoundError:
-    hastiledb = False
-
 from . import NDARRAY_ARROW_TYPES_NOT_SUPPORTED, NDARRAY_ARROW_TYPES_SUPPORTED
 from ._util import raises_no_typeguard
 
@@ -331,11 +325,9 @@ def test_sparse_nd_array_read_write_sparse_tensor(
 
         assert t.shape == shape
 
-    # Validate TileDB array schema
-    if hastiledb:
-        with tiledb.open(tmp_path.as_posix()) as A:
-            assert A.schema.sparse
-            assert not A.schema.allows_duplicates
+    with soma.SparseNDArray.open(tmp_path.as_posix()) as A:
+        assert A.is_sparse
+        assert not A.config_options_from_schema().allows_duplicates
 
 
 @pytest.mark.parametrize("shape", [(10,), (23, 4), (5, 3, 1), (8, 4, 2, 30)])
@@ -357,11 +349,9 @@ def test_sparse_nd_array_read_write_table(
     assert isinstance(t, pa.Table)
     assert tables_are_same_value(data, t)
 
-    # Validate TileDB array schema
-    if hastiledb:
-        with tiledb.open(tmp_path.as_posix()) as A:
-            assert A.schema.sparse
-            assert not A.schema.allows_duplicates
+    with soma.SparseNDArray.open(tmp_path.as_posix()) as A:
+        assert A.is_sparse
+        assert not A.config_options_from_schema().allows_duplicates
 
 
 @pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64])
@@ -387,11 +377,9 @@ def test_sparse_nd_array_read_as_pandas(
         data.to_pandas().sort_values(by=dim_names, ignore_index=True)
     )
 
-    # Validate TileDB array schema
-    if hastiledb:
-        with tiledb.open(tmp_path.as_posix()) as A:
-            assert A.schema.sparse
-            assert not A.schema.allows_duplicates
+    with soma.SparseNDArray.open(tmp_path.as_posix()) as A:
+        assert A.is_sparse
+        assert not A.config_options_from_schema().allows_duplicates
 
 
 @pytest.mark.parametrize("shape_is_nones", [True, False])
@@ -1096,14 +1084,14 @@ def test_tile_extents(tmp_path):
         },
     ).close()
 
-    if hastiledb:
-        with tiledb.open(uri) as A:
-            if soma._flags.NEW_SHAPE_FEATURE_FLAG_ENABLED:
-                assert A.schema.domain.dim(0).tile == 2048
-                assert A.schema.domain.dim(1).tile == 2048
-            else:
-                assert A.schema.domain.dim(0).tile == 100
-                assert A.schema.domain.dim(1).tile == 2048
+    with soma.SparseNDArray.open(tmp_path.as_posix()) as A:
+        dim_info = json.loads(A.config_options_from_schema().dims)
+        if soma._flags.NEW_SHAPE_FEATURE_FLAG_ENABLED:
+            assert int(dim_info["soma_dim_0"]["tile"]) == 2048
+            assert int(dim_info["soma_dim_1"]["tile"]) == 2048
+        else:
+            assert int(dim_info["soma_dim_0"]["tile"]) == 100
+            assert int(dim_info["soma_dim_1"]["tile"]) == 2048
 
 
 @pytest.mark.parametrize(
@@ -1112,33 +1100,26 @@
     "create_options,expected_schema_fields",
     (
         (
             {"allows_duplicates": True},
             {
-                "validity_filters": (
-                    tiledb.FilterList([tiledb.RleFilter()]) if hastiledb else None
-                ),
+                "validity_filters": [{"COMPRESSION_LEVEL": -1, "name": "RLE"}],
                 "allows_duplicates": True,
             },
         ),
         (
             {"allows_duplicates": False},
             {
-                "validity_filters": (
-                    tiledb.FilterList([tiledb.RleFilter()]) if hastiledb else None
-                ),
+                "validity_filters": [{"COMPRESSION_LEVEL": -1, "name": "RLE"}],
                 "allows_duplicates": False,
             },
         ),
         (
             {"validity_filters": ["NoOpFilter"], "allows_duplicates": False},
             {
-                "validity_filters": (
-                    tiledb.FilterList([tiledb.NoOpFilter()]) if hastiledb else None
-                ),
+                "validity_filters": [{"name": "NOOP"}],
                 "allows_duplicates": False,
             },
         ),
     ),
 )
-@pytest.mark.skipif(not hastiledb, reason="tiledb-py not installed")
 def test_create_platform_config_overrides(
     tmp_path, create_options, expected_schema_fields
 ):
     uri = tmp_path.as_posix()
 
     soma.SparseNDArray.create(
         uri,
         type=pa.uint8(),
         shape=(100, 100),
         platform_config={"tiledb": {"create": {**create_options}}},
     ).close()
-    with tiledb.open(uri) as D:
-        for k, v in expected_schema_fields.items():
-            assert getattr(D.schema, k) == v
+
+    with soma.SparseNDArray.open(tmp_path.as_posix()) as A:
+        cfg = A.config_options_from_schema()
+        assert expected_schema_fields["validity_filters"] == json.loads(
+            cfg.validity_filters
+        )
+        assert expected_schema_fields["allows_duplicates"] == cfg.allows_duplicates
 
 
 def test_timestamped_ops(tmp_path):
diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc
index db28d37fb7..d176c81ce0 100644
--- a/libtiledbsoma/src/utils/arrow_adapter.cc
+++ b/libtiledbsoma/src/utils/arrow_adapter.cc
@@ -212,7 +212,7 @@ json ArrowAdapter::_get_attrs_filter_list_json(
     json attrs_filter_list_as_json;
     for (const auto& attr : tiledb_schema.attributes()) {
         json attr_info = {
-            {"filter_list", _get_filter_list_json(attr.second.filter_list())}};
+            {"filters", _get_filter_list_json(attr.second.filter_list())}};
         attrs_filter_list_as_json.emplace(attr.first, attr_info);
     }
     return attrs_filter_list_as_json;
@@ -223,7 +223,7 @@ json ArrowAdapter::_get_dims_list_json(const ArraySchema& tiledb_schema) {
     for (const auto& dim : tiledb_schema.domain().dimensions()) {
         json dim_info = {
             {"tile", dim.tile_extent_to_str()},
-            {"filter_list", _get_filter_list_json(dim.filter_list())}};
+            {"filters", _get_filter_list_json(dim.filter_list())}};
         dims_as_json.emplace(dim.name(), dim_info);
     }
     return dims_as_json;
diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc
index dff84b8337..d481a86310 100644
--- a/libtiledbsoma/test/unit_soma_dataframe.cc
+++ b/libtiledbsoma/test/unit_soma_dataframe.cc
@@ -382,12 +382,11 @@ TEST_CASE_METHOD(
                 Filter::to_str(filter.second));
             if (filter.second != TILEDB_FILTER_WEBP) {
                 REQUIRE(
-                    json::parse(config_options.attrs)["a0"]["filter_list"][0]
-                        .at("name") == Filter::to_str(filter.second));
+                    json::parse(config_options.attrs)["a0"]["filters"][0].at(
+                        "name") == Filter::to_str(filter.second));
             }
             REQUIRE(
-                json::parse(
-                    config_options.dims)["soma_joinid"]["filter_list"][0]
+                json::parse(config_options.dims)["soma_joinid"]["filters"][0]
                     .at("name") == Filter::to_str(TILEDB_FILTER_ZSTD));
 
             sdf->close();