From 8bb0b3457bc31925e2ad0e737f1b29de9da74cbf Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Wed, 8 Jan 2025 00:24:06 -0800 Subject: [PATCH 1/2] add known bugs to work in progress section of the v3 migration guide (#2670) --- docs/user-guide/v3_migration.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst index d90b87a897..66fcca6d19 100644 --- a/docs/user-guide/v3_migration.rst +++ b/docs/user-guide/v3_migration.rst @@ -206,3 +206,5 @@ of Zarr-Python, please open (or comment on) a * Object dtypes (:issue:`2617`) * Ragged arrays (:issue:`2618`) * Groups and Arrays do not implement ``__enter__`` and ``__exit__`` protocols (:issue:`2619`) + * Big Endian dtypes (:issue:`2324`) + * Default filters for object dtypes for Zarr format 2 arrays (:issue:`2627`) From eb2542498e93613e85c9555dcd2ccc606378fd57 Mon Sep 17 00:00:00 2001 From: Will Moore Date: Wed, 8 Jan 2025 10:26:30 +0000 Subject: [PATCH 2/2] Fix json indent (#2546) * Fix usage of config json_indent in V3JsonEncoder * Add test for json_indent * parametrize json indent * Add None to indent test parameters * ruff fix * other ruff fixes * Update src/zarr/core/metadata/v3.py Co-authored-by: Joe Hamman * Use explicit json encoder args * Add types * Update byte counts for tests --------- Co-authored-by: Joe Hamman Co-authored-by: Deepak Cherian --- docs/user-guide/arrays.rst | 4 ++-- docs/user-guide/groups.rst | 4 ++-- docs/user-guide/performance.rst | 4 ++-- src/zarr/core/metadata/v3.py | 28 +++++++++++++++++++++++++--- tests/test_array.py | 25 ++++++++++++------------- tests/test_metadata/test_v3.py | 11 ++++++++++- 6 files changed, 53 insertions(+), 23 deletions(-) diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index ba85ce1cda..ae2c4b47eb 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -209,7 +209,7 @@ prints additional diagnostics, e.g.:: Serializer : BytesCodec(endian=) Compressors : 
(BloscCodec(typesize=4, cname=, clevel=3, shuffle=, blocksize=0),) No. bytes : 400000000 (381.5M) - No. bytes stored : 9696302 + No. bytes stored : 9696520 Storage ratio : 41.3 Chunks Initialized : 100 @@ -611,7 +611,7 @@ Sharded arrays can be created by providing the ``shards`` parameter to :func:`za Serializer : BytesCodec(endian=) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 100000000 (95.4M) - No. bytes stored : 3981060 + No. bytes stored : 3981552 Storage ratio : 25.1 Shards Initialized : 100 diff --git a/docs/user-guide/groups.rst b/docs/user-guide/groups.rst index da5f393246..1e72df3478 100644 --- a/docs/user-guide/groups.rst +++ b/docs/user-guide/groups.rst @@ -113,8 +113,8 @@ property. E.g.:: Serializer : BytesCodec(endian=) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 8000000 (7.6M) - No. bytes stored : 1432 - Storage ratio : 5586.6 + No. bytes stored : 1614 + Storage ratio : 4956.6 Chunks Initialized : 0 >>> baz.info Type : Array diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst index 265bef8efe..42d830780f 100644 --- a/docs/user-guide/performance.rst +++ b/docs/user-guide/performance.rst @@ -131,7 +131,7 @@ ratios, depending on the correlation structure within the data. E.g.:: Serializer : BytesCodec(endian=) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) - No. bytes stored : 342588717 + No. bytes stored : 342588911 Storage ratio : 1.2 Chunks Initialized : 100 >>> with zarr.config.set({'array.order': 'F'}): @@ -150,7 +150,7 @@ ratios, depending on the correlation structure within the data. E.g.:: Serializer : BytesCodec(endian=) Compressors : (ZstdCodec(level=0, checksum=False),) No. bytes : 400000000 (381.5M) - No. bytes stored : 342588717 + No. 
bytes stored : 342588911 Storage ratio : 1.2 Chunks Initialized : 100 diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 13a275a6a1..ab62508c80 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -7,6 +7,7 @@ from zarr.core.buffer.core import default_buffer_prototype if TYPE_CHECKING: + from collections.abc import Callable from typing import Self from zarr.core.buffer import Buffer, BufferPrototype @@ -143,9 +144,30 @@ def parse_storage_transformers(data: object) -> tuple[dict[str, JSON], ...]: class V3JsonEncoder(json.JSONEncoder): - def __init__(self, *args: Any, **kwargs: Any) -> None: - self.indent = kwargs.pop("indent", config.get("json_indent")) - super().__init__(*args, **kwargs) + def __init__( + self, + *, + skipkeys: bool = False, + ensure_ascii: bool = True, + check_circular: bool = True, + allow_nan: bool = True, + sort_keys: bool = False, + indent: int | None = None, + separators: tuple[str, str] | None = None, + default: Callable[[object], object] | None = None, + ) -> None: + if indent is None: + indent = config.get("json_indent") + super().__init__( + skipkeys=skipkeys, + ensure_ascii=ensure_ascii, + check_circular=check_circular, + allow_nan=allow_nan, + sort_keys=sort_keys, + indent=indent, + separators=separators, + default=default, + ) def default(self, o: object) -> Any: if isinstance(o, np.dtype): diff --git a/tests/test_array.py b/tests/test_array.py index 410b2e58d0..6600424147 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -399,13 +399,13 @@ async def test_chunks_initialized() -> None: def test_nbytes_stored() -> None: arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()]) result = arr.nbytes_stored() - assert result == 366 # the size of the metadata document. This is a fragile test. + assert result == 502 # the size of the metadata document. This is a fragile test. 
arr[:50] = 1 result = arr.nbytes_stored() - assert result == 566 # the size with 5 chunks filled. + assert result == 702 # the size with 5 chunks filled. arr[50:] = 2 result = arr.nbytes_stored() - assert result == 766 # the size with all chunks filled. + assert result == 902 # the size with all chunks filled. async def test_nbytes_stored_async() -> None: @@ -413,13 +413,13 @@ async def test_nbytes_stored_async() -> None: shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()] ) result = await arr.nbytes_stored() - assert result == 366 # the size of the metadata document. This is a fragile test. + assert result == 502 # the size of the metadata document. This is a fragile test. await arr.setitem(slice(50), 1) result = await arr.nbytes_stored() - assert result == 566 # the size with 5 chunks filled. + assert result == 702 # the size with 5 chunks filled. await arr.setitem(slice(50, 100), 2) result = await arr.nbytes_stored() - assert result == 766 # the size with all chunks filled. + assert result == 902 # the size with all chunks filled. def test_default_fill_values() -> None: @@ -537,7 +537,7 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, - _count_bytes_stored=373 if shards is None else 578, # the metadata? + _count_bytes_stored=521 if shards is None else 982, # the metadata? 
) assert result == expected @@ -545,11 +545,11 @@ def test_info_complete(self, chunks: tuple[int, int], shards: tuple[int, int] | result = arr.info_complete() if shards is None: expected = dataclasses.replace( - expected, _count_chunks_initialized=4, _count_bytes_stored=501 + expected, _count_chunks_initialized=4, _count_bytes_stored=649 ) else: expected = dataclasses.replace( - expected, _count_chunks_initialized=1, _count_bytes_stored=774 + expected, _count_chunks_initialized=1, _count_bytes_stored=1178 ) assert result == expected @@ -624,7 +624,7 @@ async def test_info_complete_async( _serializer=BytesCodec(), _count_bytes=512, _count_chunks_initialized=0, - _count_bytes_stored=373 if shards is None else 578, # the metadata? + _count_bytes_stored=521 if shards is None else 982, # the metadata? ) assert result == expected @@ -632,13 +632,12 @@ async def test_info_complete_async( result = await arr.info_complete() if shards is None: expected = dataclasses.replace( - expected, _count_chunks_initialized=4, _count_bytes_stored=501 + expected, _count_chunks_initialized=4, _count_bytes_stored=553 ) else: expected = dataclasses.replace( - expected, _count_chunks_initialized=1, _count_bytes_stored=774 + expected, _count_chunks_initialized=1, _count_bytes_stored=1178 ) - assert result == expected @pytest.mark.parametrize("store", ["memory"], indirect=True) diff --git a/tests/test_metadata/test_v3.py b/tests/test_metadata/test_v3.py index ef527f42ef..a47cbf43bb 100644 --- a/tests/test_metadata/test_v3.py +++ b/tests/test_metadata/test_v3.py @@ -10,7 +10,8 @@ from zarr.codecs.bytes import BytesCodec from zarr.core.buffer import default_buffer_prototype from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding, V2ChunkKeyEncoding -from zarr.core.group import parse_node_type +from zarr.core.config import config +from zarr.core.group import GroupMetadata, parse_node_type from zarr.core.metadata.v3 import ( ArrayV3Metadata, DataType, @@ -304,6 +305,14 @@ def 
test_metadata_to_dict( assert observed == expected +@pytest.mark.parametrize("indent", [2, 4, None]) +def test_json_indent(indent: int | None) -> None: + with config.set({"json_indent": indent}): + m = GroupMetadata() + d = m.to_buffer_dict(default_buffer_prototype())["zarr.json"].to_bytes() + assert d == json.dumps(json.loads(d), indent=indent).encode() + + # @pytest.mark.parametrize("fill_value", [-1, 0, 1, 2932897]) # @pytest.mark.parametrize("precision", ["ns", "D"]) # async def test_datetime_metadata(fill_value: int, precision: str) -> None: