From 6ce05265472771e922e69012105d2210e3405aa9 Mon Sep 17 00:00:00 2001
From: Joe Hamman
Date: Wed, 23 Oct 2024 08:37:55 -0700
Subject: [PATCH 1/5] [v3] Array.append (#2413)

* feature(array): implement Array.append

changes Array.resize to be an in-place operation

* better error message

* no more warn

* style: pre-commit fixes

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 src/zarr/core/array.py           | 130 ++++++++++++++++++---
 tests/test_array.py              | 188 +++++++++++++++++++++++++++++++
 tests/test_codecs/test_codecs.py |   3 +-
 3 files changed, 302 insertions(+), 19 deletions(-)

diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index bdafa33f67..8c4d797e9a 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -2,7 +2,7 @@

 import json
 from asyncio import gather
-from dataclasses import dataclass, field, replace
+from dataclasses import dataclass, field
 from itertools import starmap
 from logging import getLogger
 from typing import TYPE_CHECKING, Any, Generic, Literal, cast, overload
@@ -1104,15 +1104,15 @@ async def setitem(
         )
         return await self._set_selection(indexer, value, prototype=prototype)

-    async def resize(self, new_shape: ChunkCoords, delete_outside_chunks: bool = True) -> Self:
+    async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None:
+        new_shape = parse_shapelike(new_shape)
         assert len(new_shape) == len(self.metadata.shape)
         new_metadata = self.metadata.update_shape(new_shape)

-        # Remove all chunks outside of the new shape
-        old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
-        new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))
-
         if delete_outside_chunks:
+            # Remove all chunks outside of the new shape
+            old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape))
+            new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape))

             async def _delete_key(key: str) -> None:
                 await (self.store_path / key).delete()
@@ -1128,7 +1128,63 @@ async def _delete_key(key: str) -> None:

         # Write new metadata
         await self._save_metadata(new_metadata)
-        return replace(self, metadata=new_metadata)
+
+        # Update metadata (in place)
+        object.__setattr__(self, "metadata", new_metadata)
+
+    async def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
+        """Append `data` to `axis`.
+
+        Parameters
+        ----------
+        data : array-like
+            Data to be appended.
+        axis : int
+            Axis along which to append.
+
+        Returns
+        -------
+        new_shape : tuple
+
+        Notes
+        -----
+        The size of all dimensions other than `axis` must match between this
+        array and `data`.
+        """
+        # ensure data is array-like
+        if not hasattr(data, "shape"):
+            data = np.asanyarray(data)
+
+        self_shape_preserved = tuple(s for i, s in enumerate(self.shape) if i != axis)
+        data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis)
+        if self_shape_preserved != data_shape_preserved:
+            raise ValueError(
+                f"shape of data to append is not compatible with the array. "
+                f"The shape of the data is {data_shape_preserved} "
+                f"and the shape of the array is {self_shape_preserved}. "
+                "All dimensions must match except for the dimension being "
+                "appended."
+            )
+
+        # remember old shape
+        old_shape = self.shape
+
+        # determine new shape
+        new_shape = tuple(
+            self.shape[i] if i != axis else self.shape[i] + data.shape[i]
+            for i in range(len(self.shape))
+        )
+
+        # resize
+        await self.resize(new_shape)
+
+        # store data
+        append_selection = tuple(
+            slice(None) if i != axis else slice(old_shape[i], new_shape[i])
+            for i in range(len(self.shape))
+        )
+        await self.setitem(append_selection, data)
+
+        return new_shape

     async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
         # metadata.attributes is "frozen" so we simply clear and update the dict
@@ -1147,7 +1203,8 @@ async def info(self) -> None:
         raise NotImplementedError


-@dataclass(frozen=True)
+# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
+@dataclass(frozen=False)
 class Array:
     """Instantiate an array from an initialized store."""

@@ -1297,6 +1354,11 @@ def shape(self) -> ChunkCoords:
         """
         return self._async_array.shape

+    @shape.setter
+    def shape(self, value: ChunkCoords) -> None:
+        """Sets the shape of the array by calling resize."""
+        self.resize(value)
+
     @property
     def chunks(self) -> ChunkCoords:
         """Returns a tuple of integers describing the length of each dimension of a chunk of the array.
@@ -2754,18 +2816,18 @@ def blocks(self) -> BlockIndex:
         :func:`set_block_selection` for documentation and examples."""
         return BlockIndex(self)

-    def resize(self, new_shape: ChunkCoords) -> Array:
+    def resize(self, new_shape: ShapeLike) -> None:
         """
         Change the shape of the array by growing or shrinking one or more
         dimensions.

-        This method does not modify the original Array object. Instead, it returns a new Array
-        with the specified shape.
+        Parameters
+        ----------
+        new_shape : tuple
+            New shape of the array.

         Notes
         -----
-        When resizing an array, the data are not rearranged in any way.
-
         If one or more dimensions are shrunk, any chunks falling outside the
         new array shape will be deleted from the underlying store.
         However, it is noteworthy that the chunks partially falling inside the new array
         (i.e. boundary chunks) will remain intact, and therefore, the data falling
         outside the new array but inside the boundary chunks would be restored by a
         subsequent resize operation that grows the array size.

         Examples
         --------
         >>> import zarr
         >>> z = zarr.zeros(shape=(10000, 10000),
         >>>     chunk_shape=(1000, 1000),
-        >>>     store=StorePath(MemoryStore(mode="w")),
         >>>     dtype="i4",)
         >>> z.shape
         (10000, 10000)
         >>> z2.shape
         (50, 50)
         """
-        resized = sync(self._async_array.resize(new_shape))
-        # TODO: remove this cast when type inference improves
-        _resized = cast(AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], resized)
-        return type(self)(_resized)
+        sync(self._async_array.resize(new_shape))
+
+    def append(self, data: npt.ArrayLike, axis: int = 0) -> ChunkCoords:
+        """Append `data` to `axis`.
+
+        Parameters
+        ----------
+        data : array-like
+            Data to be appended.
+        axis : int
+            Axis along which to append.
+
+        Returns
+        -------
+        new_shape : tuple
+
+        Notes
+        -----
+        The size of all dimensions other than `axis` must match between this
+        array and `data`.
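+
+        The array is resized in place (grown along `axis`) and `data` is then
+        written into the newly added region.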
+ + Examples + -------- + >>> import numpy as np + >>> import zarr + >>> a = np.arange(10000000, dtype='i4').reshape(10000, 1000) + >>> z = zarr.array(a, chunks=(1000, 100)) + >>> z.shape + (10000, 1000) + >>> z.append(a) + (20000, 1000) + >>> z.append(np.vstack([a, a]), axis=1) + (20000, 2000) + >>> z.shape + (20000, 2000) + """ + return sync(self._async_array.append(data, axis=axis)) def update_attributes(self, new_attributes: dict[str, JSON]) -> Array: # TODO: remove this cast when type inference improves diff --git a/tests/test_array.py b/tests/test_array.py index f182cb1a14..ae8e7f99c2 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -419,6 +419,194 @@ def test_update_attrs(zarr_format: int) -> None: assert arr2.attrs["foo"] == "bar" +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_resize_1d(store: MemoryStore, zarr_format: int) -> None: + z = zarr.create( + shape=105, chunks=10, dtype="i4", fill_value=0, store=store, zarr_format=zarr_format + ) + a = np.arange(105, dtype="i4") + z[:] = a + assert (105,) == z.shape + assert (105,) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(a, z[:]) + + z.resize(205) + assert (205,) == z.shape + assert (205,) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(a, z[:105]) + np.testing.assert_array_equal(np.zeros(100, dtype="i4"), z[105:]) + + z.resize(55) + assert (55,) == z.shape + assert (55,) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(a[:55], z[:]) + + # via shape setter + new_shape = (105,) + z.shape = new_shape + assert new_shape == z.shape + assert new_shape == z[:].shape + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_resize_2d(store: MemoryStore, zarr_format: int) -> None: + z = zarr.create( + shape=(105, 105), + chunks=(10, 10), + dtype="i4", + fill_value=0, + store=store, + zarr_format=zarr_format, + ) + a = np.arange(105 * 105, dtype="i4").reshape((105, 105)) + z[:] = a + assert (105, 105) == z.shape + assert (105, 105) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a, z[:]) + + z.resize((205, 205)) + assert (205, 205) == z.shape + assert (205, 205) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a, z[:105, :105]) + np.testing.assert_array_equal(np.zeros((100, 205), dtype="i4"), z[105:, :]) + np.testing.assert_array_equal(np.zeros((205, 100), dtype="i4"), z[:, 105:]) + + z.resize((55, 55)) + assert (55, 55) == z.shape + assert (55, 55) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a[:55, :55], z[:]) + + z.resize((55, 1)) + assert (55, 1) == z.shape + assert (55, 1) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a[:55, :1], z[:]) + + z.resize((1, 55)) + assert (1, 55) == z.shape + assert (1, 55) == z[:].shape + assert np.dtype("i4") == z.dtype + assert np.dtype("i4") == z[:].dtype + assert 
(10, 10) == z.chunks + np.testing.assert_array_equal(a[:1, :10], z[:, :10]) + np.testing.assert_array_equal(np.zeros((1, 55 - 10), dtype="i4"), z[:, 10:55]) + + # via shape setter + new_shape = (105, 105) + z.shape = new_shape + assert new_shape == z.shape + assert new_shape == z[:].shape + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_append_1d(store: MemoryStore, zarr_format: int) -> None: + a = np.arange(105) + z = zarr.create(shape=a.shape, chunks=10, dtype=a.dtype, store=store, zarr_format=zarr_format) + z[:] = a + assert a.shape == z.shape + assert a.dtype == z.dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(a, z[:]) + + b = np.arange(105, 205) + e = np.append(a, b) + assert z.shape == (105,) + z.append(b) + assert e.shape == z.shape + assert e.dtype == z.dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(e, z[:]) + + # check append handles array-like + c = [1, 2, 3] + f = np.append(e, c) + z.append(c) + assert f.shape == z.shape + assert f.dtype == z.dtype + assert (10,) == z.chunks + np.testing.assert_array_equal(f, z[:]) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_append_2d(store: MemoryStore, zarr_format: int) -> None: + a = np.arange(105 * 105, dtype="i4").reshape((105, 105)) + z = zarr.create( + shape=a.shape, chunks=(10, 10), dtype=a.dtype, store=store, zarr_format=zarr_format + ) + z[:] = a + assert a.shape == z.shape + assert a.dtype == z.dtype + assert (10, 10) == z.chunks + actual = z[:] + np.testing.assert_array_equal(a, actual) + + b = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105)) + e = np.append(a, b, axis=0) + z.append(b) + assert e.shape == z.shape + assert e.dtype == z.dtype + assert (10, 10) == z.chunks + actual = z[:] + np.testing.assert_array_equal(e, actual) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_append_2d_axis(store: MemoryStore, zarr_format: int) -> None: + a = np.arange(105 * 105, dtype="i4").reshape((105, 105)) + z = zarr.create( + shape=a.shape, chunks=(10, 10), dtype=a.dtype, store=store, zarr_format=zarr_format + ) + z[:] = a + assert a.shape == z.shape + assert a.dtype == z.dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(a, z[:]) + + b = np.arange(105 * 105, 2 * 105 * 105, dtype="i4").reshape((105, 105)) + e = np.append(a, b, axis=1) + z.append(b, axis=1) + assert e.shape == z.shape + assert e.dtype == z.dtype + assert (10, 10) == z.chunks + np.testing.assert_array_equal(e, z[:]) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("zarr_format", [2, 3]) +def test_append_bad_shape(store: MemoryStore, zarr_format: int) -> None: + a = np.arange(100) + z = zarr.create(shape=a.shape, chunks=10, dtype=a.dtype, store=store, zarr_format=zarr_format) + z[:] = a + b = a.reshape(10, 10) + with pytest.raises(ValueError): + z.append(b) + + @pytest.mark.parametrize("order", ["C", "F", None]) @pytest.mark.parametrize("zarr_format", [2, 3]) @pytest.mark.parametrize("store", ["memory"], indirect=True) diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py index 7a5fb979a1..0f2f892915 100644 --- a/tests/test_codecs/test_codecs.py +++ b/tests/test_codecs/test_codecs.py @@ -371,8 +371,9 @@ async def test_resize(store: Store) -> None: assert await store.get(f"{path}/0.1", 
prototype=default_buffer_prototype()) is not None assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is not None - a = await a.resize((10, 12)) + await a.resize((10, 12)) assert a.metadata.shape == (10, 12) + assert a.shape == (10, 12) assert await store.get(f"{path}/0.0", prototype=default_buffer_prototype()) is not None assert await store.get(f"{path}/0.1", prototype=default_buffer_prototype()) is not None assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is None From bc588a760a804f783c4242d4435863a43a5f3f9f Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 23 Oct 2024 13:30:49 -0600 Subject: [PATCH 2/5] Fix JSON encoding of complex fill values (#2432) * Fix JSON encoding of complex fill values We were not replacing NaNs and Infs with the string versions. * Fix decoding of complex fill values * try excluding `math.inf` * Check complex numbers explicitly * Update src/zarr/core/metadata/v3.py --- src/zarr/core/metadata/v3.py | 26 ++++++++++++++++++++++---- tests/test_array.py | 23 +++++++++++++++++++++++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 6b6f28dd96..7a38e9fd70 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -43,6 +43,13 @@ DEFAULT_DTYPE = "float64" +# Keep in sync with _replace_special_floats +SPECIAL_FLOATS_ENCODED = { + "Infinity": np.inf, + "-Infinity": -np.inf, + "NaN": np.nan, +} + def parse_zarr_format(data: object) -> Literal[3]: if data == 3: @@ -149,7 +156,7 @@ def default(self, o: object) -> Any: if isinstance(out, complex): # python complex types are not JSON serializable, so we use the # serialization defined in the zarr v3 spec - return [out.real, out.imag] + return _replace_special_floats([out.real, out.imag]) elif np.isnan(out): return "NaN" elif np.isinf(out): @@ -447,8 +454,11 @@ def parse_fill_value( if isinstance(fill_value, Sequence) and not isinstance(fill_value, str): if data_type in (DataType.complex64, DataType.complex128): if len(fill_value) == 2: + decoded_fill_value = tuple( + SPECIAL_FLOATS_ENCODED.get(value, value) for value in fill_value + ) # complex datatypes serialize to JSON arrays with two elements - return np_dtype.type(complex(*fill_value)) + return np_dtype.type(complex(*decoded_fill_value)) else: msg = ( f"Got an invalid fill value for complex data type {data_type.value}." 
@@ -475,12 +485,20 @@ def parse_fill_value( pass elif fill_value in ["Infinity", "-Infinity"] and not np.isfinite(casted_value): pass - elif np_dtype.kind in "cf": + elif np_dtype.kind == "f": # float comparison is not exact, especially when dtype None: + store = MemoryStore({}, mode="w") + Array.create(store=store, shape=(1,), dtype=np.complex64, fill_value=fill_value) + content = await store.get("zarr.json", prototype=default_buffer_prototype()) + assert content is not None + actual = json.loads(content.to_bytes()) + assert actual["fill_value"] == expected From 87ca1508f8c362a91e29ad7aa0759ab26047c792 Mon Sep 17 00:00:00 2001 From: Hannes Spitz <44113112+brokkoli71@users.noreply.github.com> Date: Thu, 24 Oct 2024 22:44:30 +0200 Subject: [PATCH 3/5] support zero-sized chunks (#2434) * support zero-sized chunks * fix imports * add min_side=0 to testing strategies * fix property tests --------- Co-authored-by: Deepak Cherian --- src/zarr/core/indexing.py | 4 +++- src/zarr/testing/strategies.py | 15 +++++++++++---- tests/test_indexing.py | 10 +++++++++- tests/test_properties.py | 6 ++++-- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/indexing.py b/src/zarr/core/indexing.py index f1d5fd16d1..1873d5c83c 100644 --- a/src/zarr/core/indexing.py +++ b/src/zarr/core/indexing.py @@ -94,6 +94,8 @@ def __iter__(self) -> Iterator[ChunkProjection]: ... def ceildiv(a: float, b: float) -> int: + if a == 0: + return 0 return math.ceil(a / b) @@ -374,7 +376,7 @@ def __init__(self, dim_sel: slice, dim_len: int, dim_chunk_len: int) -> None: def __iter__(self) -> Iterator[ChunkDimProjection]: # figure out the range of chunks we need to visit - dim_chunk_ix_from = self.start // self.dim_chunk_len + dim_chunk_ix_from = 0 if self.start == 0 else self.start // self.dim_chunk_len dim_chunk_ix_to = ceildiv(self.stop, self.dim_chunk_len) # iterate over chunks in range diff --git a/src/zarr/testing/strategies.py b/src/zarr/testing/strategies.py index 2c17fbf79d..c82e168cf1 100644 --- a/src/zarr/testing/strategies.py +++ b/src/zarr/testing/strategies.py @@ -65,7 +65,7 @@ def v2_dtypes() -> st.SearchStrategy[np.dtype]: stores = st.builds(MemoryStore, st.just({}), mode=st.just("w")) compressors = st.sampled_from([None, "default"]) zarr_formats: st.SearchStrategy[Literal[2, 3]] = st.sampled_from([2, 3]) -array_shapes = npst.array_shapes(max_dims=4) +array_shapes = npst.array_shapes(max_dims=4, min_side=0) @st.composite # type: ignore[misc] @@ -85,7 +85,7 @@ def numpy_arrays( @st.composite # type: ignore[misc] def np_array_and_chunks( draw: st.DrawFn, *, arrays: st.SearchStrategy[np.ndarray] = numpy_arrays -) -> tuple[np.ndarray, tuple[int]]: # type: ignore[type-arg] +) -> tuple[np.ndarray, tuple[int, ...]]: # type: ignore[type-arg] """A hypothesis strategy to generate small sized random arrays. Returns: a tuple of the array and a suitable random chunking for it. @@ -93,9 +93,16 @@ def np_array_and_chunks( array = draw(arrays) # We want this strategy to shrink towards arrays with smaller number of chunks # 1. st.integers() shrinks towards smaller values. So we use that to generate number of chunks - numchunks = draw(st.tuples(*[st.integers(min_value=1, max_value=size) for size in array.shape])) + numchunks = draw( + st.tuples( + *[st.integers(min_value=0 if size == 0 else 1, max_value=size) for size in array.shape] + ) + ) # 2. 
and now generate the chunks tuple - chunks = tuple(size // nchunks for size, nchunks in zip(array.shape, numchunks, strict=True)) + chunks = tuple( + size // nchunks if nchunks > 0 else 0 + for size, nchunks in zip(array.shape, numchunks, strict=True) + ) return (array, chunks) diff --git a/tests/test_indexing.py b/tests/test_indexing.py index b3a1990686..2c51f3da3a 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -11,6 +11,7 @@ from numpy.testing import assert_array_equal import zarr +from zarr import Array from zarr.core.buffer import BufferPrototype, default_buffer_prototype from zarr.core.indexing import ( BasicSelection, @@ -31,7 +32,6 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator - from zarr.core.array import Array from zarr.core.buffer.core import Buffer from zarr.core.common import ChunkCoords @@ -1927,3 +1927,11 @@ def test_indexing_with_zarr_array(store: StorePath) -> None: assert_array_equal(a[ii], za[zii]) assert_array_equal(a[ii], za.oindex[zii]) + + +@pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) +@pytest.mark.parametrize("shape", [(0, 2, 3), (0), (3, 0)]) +def test_zero_sized_chunks(store: StorePath, shape: list[int]) -> None: + z = Array.create(store=store, shape=shape, chunk_shape=shape, zarr_format=3, dtype="f8") + z[...] = 42 + assert_array_equal(z[...], np.zeros(shape, dtype="f8")) diff --git a/tests/test_properties.py b/tests/test_properties.py index 380a4d851e..f70753ceb5 100644 --- a/tests/test_properties.py +++ b/tests/test_properties.py @@ -6,7 +6,7 @@ import hypothesis.extra.numpy as npst # noqa: E402 import hypothesis.strategies as st # noqa: E402 -from hypothesis import given # noqa: E402 +from hypothesis import assume, given # noqa: E402 from zarr.testing.strategies import arrays, basic_indices, numpy_arrays, zarr_formats # noqa: E402 @@ -35,11 +35,13 @@ def test_basic_indexing(data: st.DataObject) -> None: @given(data=st.data()) def test_vindex(data: st.DataObject) -> None: zarray = data.draw(arrays()) + # integer_array_indices can't handle 0-size dimensions. 
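+    # assume() tells hypothesis to discard any draw that fails this check and retry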
+ assume(all(s > 0 for s in zarray.shape)) nparray = zarray[:] indexer = data.draw( npst.integer_array_indices( - shape=nparray.shape, result_shape=npst.array_shapes(max_dims=None) + shape=nparray.shape, result_shape=npst.array_shapes(min_side=1, max_dims=None) ) ) actual = zarray.vindex[indexer] From 109f71f6ddec95e2eba034646c582885ad2bb44b Mon Sep 17 00:00:00 2001 From: Norman Rzepka Date: Fri, 25 Oct 2024 00:47:57 +0200 Subject: [PATCH 4/5] Refactors v2 codec handling (#2425) * refactors codec pipeline for v2 * async * rm ensure_bytes --- src/zarr/codecs/_v2.py | 115 +++++++++++++++++------------------------ src/zarr/core/array.py | 7 ++- tests/test_v2.py | 9 ++++ 3 files changed, 60 insertions(+), 71 deletions(-) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 0f50264be8..30504ad204 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -5,20 +5,21 @@ from typing import TYPE_CHECKING import numcodecs -from numcodecs.compat import ensure_bytes, ensure_ndarray +from numcodecs.compat import ensure_ndarray_like -from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec -from zarr.core.buffer import Buffer, NDBuffer, default_buffer_prototype +from zarr.abc.codec import ArrayBytesCodec from zarr.registry import get_ndbuffer_class if TYPE_CHECKING: import numcodecs.abc from zarr.core.array_spec import ArraySpec + from zarr.core.buffer import Buffer, NDBuffer @dataclass(frozen=True) -class V2Compressor(ArrayBytesCodec): +class V2Codec(ArrayBytesCodec): + filters: tuple[numcodecs.abc.Codec, ...] | None compressor: numcodecs.abc.Codec | None is_fixed_size = False @@ -28,81 +29,61 @@ async def _decode_single( chunk_bytes: Buffer, chunk_spec: ArraySpec, ) -> NDBuffer: - if self.compressor is not None: - chunk_numpy_array = ensure_ndarray( - await asyncio.to_thread(self.compressor.decode, chunk_bytes.as_array_like()) - ) + cdata = chunk_bytes.as_array_like() + # decompress + if self.compressor: + chunk = await asyncio.to_thread(self.compressor.decode, cdata) else: - chunk_numpy_array = ensure_ndarray(chunk_bytes.as_array_like()) + chunk = cdata + + # apply filters + if self.filters: + for f in reversed(self.filters): + chunk = await asyncio.to_thread(f.decode, chunk) + + # view as numpy array with correct dtype + chunk = ensure_ndarray_like(chunk) + # special case object dtype, because incorrect handling can lead to + # segfaults and other bad things happening + if chunk_spec.dtype != object: + chunk = chunk.view(chunk_spec.dtype) + elif chunk.dtype != object: + # If we end up here, someone must have hacked around with the filters. + # We cannot deal with object arrays unless there is an object + # codec in the filter chain, i.e., a filter that converts from object + # array to something else during encoding, and converts back to object + # array during decoding. 
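+            # (numcodecs.VLenUTF8 is one such object codec: it decodes raw
+            # bytes back into an object array of strings)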
+ raise RuntimeError("cannot read object array without object codec") - # ensure correct dtype - if str(chunk_numpy_array.dtype) != chunk_spec.dtype and not chunk_spec.dtype.hasobject: - chunk_numpy_array = chunk_numpy_array.view(chunk_spec.dtype) + # ensure correct chunk shape + chunk = chunk.reshape(-1, order="A") + chunk = chunk.reshape(chunk_spec.shape, order=chunk_spec.order) - return get_ndbuffer_class().from_numpy_array(chunk_numpy_array) + return get_ndbuffer_class().from_ndarray_like(chunk) async def _encode_single( - self, - chunk_array: NDBuffer, - _chunk_spec: ArraySpec, - ) -> Buffer | None: - chunk_numpy_array = chunk_array.as_numpy_array() - if self.compressor is not None: - if ( - not chunk_numpy_array.flags.c_contiguous - and not chunk_numpy_array.flags.f_contiguous - ): - chunk_numpy_array = chunk_numpy_array.copy(order="A") - encoded_chunk_bytes = ensure_bytes( - await asyncio.to_thread(self.compressor.encode, chunk_numpy_array) - ) - else: - encoded_chunk_bytes = ensure_bytes(chunk_numpy_array) - - return default_buffer_prototype().buffer.from_bytes(encoded_chunk_bytes) - - def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: - raise NotImplementedError - - -@dataclass(frozen=True) -class V2Filters(ArrayArrayCodec): - filters: tuple[numcodecs.abc.Codec, ...] | None - - is_fixed_size = False - - async def _decode_single( self, chunk_array: NDBuffer, chunk_spec: ArraySpec, - ) -> NDBuffer: - chunk_ndarray = chunk_array.as_ndarray_like() - # apply filters in reverse order - if self.filters is not None: - for filter in self.filters[::-1]: - chunk_ndarray = await asyncio.to_thread(filter.decode, chunk_ndarray) - - # ensure correct chunk shape - if chunk_ndarray.shape != chunk_spec.shape: - chunk_ndarray = chunk_ndarray.reshape( - chunk_spec.shape, - order=chunk_spec.order, - ) + ) -> Buffer | None: + chunk = chunk_array.as_ndarray_like() - return get_ndbuffer_class().from_ndarray_like(chunk_ndarray) + # apply filters + if self.filters: + for f in self.filters: + chunk = await asyncio.to_thread(f.encode, chunk) - async def _encode_single( - self, - chunk_array: NDBuffer, - chunk_spec: ArraySpec, - ) -> NDBuffer | None: - chunk_ndarray = chunk_array.as_ndarray_like().ravel(order=chunk_spec.order) + # check object encoding + if ensure_ndarray_like(chunk).dtype == object: + raise RuntimeError("cannot write object array without object codec") - if self.filters is not None: - for filter in self.filters: - chunk_ndarray = await asyncio.to_thread(filter.encode, chunk_ndarray) + # compress + if self.compressor: + cdata = await asyncio.to_thread(self.compressor.encode, chunk) + else: + cdata = chunk - return get_ndbuffer_class().from_ndarray_like(chunk_ndarray) + return chunk_spec.prototype.buffer.from_bytes(cdata) def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 8c4d797e9a..a4b86b85ae 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -13,7 +13,7 @@ from zarr._compat import _deprecate_positional_args from zarr.abc.store import Store, set_or_delete from zarr.codecs import _get_default_array_bytes_codec -from zarr.codecs._v2 import V2Compressor, V2Filters +from zarr.codecs._v2 import V2Codec from zarr.core.attributes import Attributes from zarr.core.buffer import ( BufferPrototype, @@ -118,9 +118,8 @@ def create_codec_pipeline(metadata: ArrayMetadata) -> CodecPipeline: if isinstance(metadata, 
ArrayV3Metadata):
         return get_pipeline_class().from_codecs(metadata.codecs)
     elif isinstance(metadata, ArrayV2Metadata):
-        return get_pipeline_class().from_codecs(
-            [V2Filters(metadata.filters), V2Compressor(metadata.compressor)]
-        )
+        v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor)
+        return get_pipeline_class().from_codecs([v2_codec])
     else:
         raise TypeError

diff --git a/tests/test_v2.py b/tests/test_v2.py
index 439b15b64c..3dd17848fb 100644
--- a/tests/test_v2.py
+++ b/tests/test_v2.py
@@ -121,3 +121,12 @@ async def test_create_dtype_str(dtype: Any) -> None:
     arr[:] = ["a", "bb", "ccc"]
     result = arr[:]
     np.testing.assert_array_equal(result, np.array(["a", "bb", "ccc"], dtype="object"))
+
+
+@pytest.mark.parametrize("filters", [[], [numcodecs.Delta(dtype="<i4")], [numcodecs.Zlib(level=2)]])
+def test_v2_filters_codecs(filters: Any) -> None:
+    array_fixture = [42]
+    arr = zarr.create(shape=1, dtype="<i4", zarr_format=2, filters=filters)
+    arr[:] = array_fixture
+    result = arr[:]
+    np.testing.assert_array_equal(result, array_fixture)

From: Joe Hamman
Date: Thu, 24 Oct 2024 15:51:47 -0700
Subject: [PATCH 5/5] feature(group): add group setitem api (#2393)

* feature(group): add group setitem api

* arrays proxy

* rollback to simple version

* rollback deprecation

* rollback ...

* Update tests/test_group.py
---
 src/zarr/api/asynchronous.py |  8 ++++++--
 src/zarr/core/group.py       | 24 ++++++++++++++++++++++--
 src/zarr/storage/zip.py      |  5 ++++-
 tests/test_group.py          | 21 +++++++++++++++++++--
 4 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py
index 680433565e..cd8c3543ca 100644
--- a/src/zarr/api/asynchronous.py
+++ b/src/zarr/api/asynchronous.py
@@ -396,12 +396,16 @@ async def save_array(
     mode = kwargs.pop("mode", None)
     store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options)
+    if np.isscalar(arr):
+        arr = np.array(arr)
+    shape = arr.shape
+    chunks = getattr(arr, "chunks", None)  # for array-likes with chunks attribute
     new = await AsyncArray.create(
         store_path,
         zarr_format=zarr_format,
-        shape=arr.shape,
+        shape=shape,
         dtype=arr.dtype,
-        chunks=arr.shape,
+        chunks=chunks,
         **kwargs,
     )
     await new.setitem(slice(None), arr)

diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
index 6e54b7ec9b..46f37700eb 100644
--- a/src/zarr/core/group.py
+++ b/src/zarr/core/group.py
@@ -600,6 +600,23 @@ def from_dict(
             store_path=store_path,
         )

+    async def setitem(self, key: str, value: Any) -> None:
+        """Fastpath for creating a new array.
+
+        New arrays will be created with default array settings for the array type.
+
+        Parameters
+        ----------
+        key : str
+            Array name
+        value : array-like
+            Array data
+        """
+        path = self.store_path / key
+        await async_api.save_array(
+            store=path, arr=value, zarr_format=self.metadata.zarr_format, exists_ok=True
+        )
+
     async def getitem(
         self,
         key: str,
@@ -1394,8 +1411,11 @@ def __len__(self) -> int:
         return self.nmembers()

     def __setitem__(self, key: str, value: Any) -> None:
-        """__setitem__ is not supported in v3"""
-        raise NotImplementedError
+        """Fastpath for creating a new array.
+
+        New arrays will be created using default settings for the array type.
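+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import zarr
+        >>> group = zarr.group()
+        >>> group["foo"] = np.zeros((10,))
+        >>> group["foo"].shape
+        (10,)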
+ """ + self._sync(self._async_group.setitem(key, value)) def __repr__(self) -> str: return f"" diff --git a/src/zarr/storage/zip.py b/src/zarr/storage/zip.py index 204a381bdb..d9e1aa1300 100644 --- a/src/zarr/storage/zip.py +++ b/src/zarr/storage/zip.py @@ -219,7 +219,10 @@ async def set_if_not_exists(self, key: str, value: Buffer) -> None: async def delete(self, key: str) -> None: # docstring inherited - raise NotImplementedError + # we choose to only raise NotImplementedError here if the key exists + # this allows the array/group APIs to avoid the overhead of existence checks + if await self.exists(key): + raise NotImplementedError async def exists(self, key: str) -> bool: # docstring inherited diff --git a/tests/test_group.py b/tests/test_group.py index 21e4ef4e50..bcdc6ff0da 100644 --- a/tests/test_group.py +++ b/tests/test_group.py @@ -427,8 +427,25 @@ def test_group_setitem(store: Store, zarr_format: ZarrFormat) -> None: Test the `Group.__setitem__` method. """ group = Group.from_store(store, zarr_format=zarr_format) - with pytest.raises(NotImplementedError): - group["key"] = 10 + arr = np.ones((2, 4)) + group["key"] = arr + assert list(group.array_keys()) == ["key"] + assert group["key"].shape == (2, 4) + np.testing.assert_array_equal(group["key"][:], arr) + + if store.supports_deletes: + key = "key" + else: + # overwriting with another array requires deletes + # for stores that don't support this, we just use a new key + key = "key2" + + # overwrite with another array + arr = np.zeros((3, 5)) + group[key] = arr + assert key in list(group.array_keys()) + assert group[key].shape == (3, 5) + np.testing.assert_array_equal(group[key], arr) def test_group_contains(store: Store, zarr_format: ZarrFormat) -> None: