diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2ddcdda7b..c36666c7f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@ ### Documentation and tutorial enhancements
 - Add RemFile to streaming tutorial. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761)
 - Fix typos and improve clarify throughout tutorials. @zm711 [#1825](https://github.com/NeurodataWithoutBorders/pynwb/pull/1825)
+- Add Zarr IO tutorial. @bendichter [#1834](https://github.com/NeurodataWithoutBorders/pynwb/pull/1834)

 ## PyNWB 2.5.0 (August 18, 2023)
diff --git a/docs/gallery/advanced_io/plot_zarr_io.py b/docs/gallery/advanced_io/plot_zarr_io.py
new file mode 100644
index 000000000..b61fe4a03
--- /dev/null
+++ b/docs/gallery/advanced_io/plot_zarr_io.py
@@ -0,0 +1,98 @@
+"""
+Zarr IO
+=======
+
+Zarr is an alternative backend option for NWB files. It is a Python package that
+provides an implementation of chunked, compressed, N-dimensional arrays. Zarr is a good
+option for large datasets because, like HDF5, it is designed to store data on disk and
+only load the data into memory when needed. Zarr is also a good option for parallel
+computing because it supports concurrent reads and writes.
+
+Note that the native Zarr storage format is optimized for cloud object storage
+(e.g., S3). For very large files, Zarr will create many files, which can lead to
+issues for traditional file systems (that are not cloud object stores) due to limits
+on the number of files per directory (this affects local disk, GDrive, Dropbox, etc.).
+
+Zarr read and write support is provided by the :hdmf-zarr:`hdmf-zarr<>` package. First,
+create an NWBFile using PyNWB.
+"""
+
+# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_plot_nwbzarrio.png'
+
+
+from datetime import datetime
+from dateutil.tz import tzlocal
+
+import numpy as np
+from pynwb import NWBFile, TimeSeries
+
+# Create the NWBFile. Substitute your NWBFile generation here.
+nwbfile = NWBFile(
+    session_description="my first synthetic recording",
+    identifier="EXAMPLE_ID",
+    session_start_time=datetime.now(tzlocal()),
+    session_id="LONELYMTN",
+)
+
+#######################################################################################
+# Dataset Configuration
+# ---------------------
+# Like HDF5, Zarr provides options to chunk and compress datasets. To leverage these
+# features, replace all :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO` with the
+# analogous :py:class:`~hdmf_zarr.utils.ZarrDataIO`, which takes compressors specified
+# by the :py:mod:`numcodecs` library. For example, here is a :py:class:`.TimeSeries`
+# where the ``data`` dataset is compressed with a Blosc-zstd compressor:
+
+from numcodecs import Blosc
+from hdmf_zarr import ZarrDataIO
+
+data_with_zarr_data_io = ZarrDataIO(
+    data=np.random.randn(100, 100),
+    chunks=(10, 10),
+    fillvalue=0,
+    compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.SHUFFLE)
+)
+
+#######################################################################################
+# Now add it to the :py:class:`.NWBFile`.
+
+nwbfile.add_acquisition(
+    TimeSeries(
+        name="synthetic_timeseries",
+        data=data_with_zarr_data_io,
+        unit="m",
+        rate=10e3,
+    )
+)
+
+#######################################################################################
+# Writing to Zarr
+# ---------------
+# To write NWB files to Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with
+# :py:class:`hdmf_zarr.nwb.NWBZarrIO`.
+
+from hdmf_zarr.nwb import NWBZarrIO
+import os
+
+path = "zarr_tutorial.nwb.zarr"
+absolute_path = os.path.abspath(path)
+with NWBZarrIO(path=path, mode="w") as io:
+    io.write(nwbfile)
+
+#######################################################################################
+# .. note::
+#    The main reason for using the ``absolute_path`` here is for testing purposes to
+#    ensure that links and references work as expected. Otherwise, using the relative
+#    path is also fine.
+#
+# Reading from Zarr
+# -----------------
+# To read NWB files from Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with the
+# analogous :py:class:`hdmf_zarr.nwb.NWBZarrIO`.
+
+with NWBZarrIO(path=absolute_path, mode="r") as io:
+    read_nwbfile = io.read()
+
+#######################################################################################
+# .. note::
+#    For more information, see the :hdmf-zarr:`hdmf-zarr documentation<>`.
diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py
index 31ce6793c..760e2da71 100644
--- a/docs/gallery/advanced_io/streaming.py
+++ b/docs/gallery/advanced_io/streaming.py
@@ -169,6 +169,6 @@
 # 1. supports caching, which will dramatically speed up repeated requests for the
 #    same region of data,
 # 2. automatically retries when s3 fails to return, which helps avoid errors when accessing data due to
-#    intermittent errors in connections with S3 (remfile does this as well), 
+#    intermittent errors in connections with S3 (remfile does this as well),
 # 3. works also with other storage backends (e.g., GoogleDrive or Dropbox, not just S3) and file formats, and
 # 4. in our experience appears to provide faster out-of-the-box performance than the ros3 driver.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 143d9d2c6..5725bd816 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -148,6 +148,8 @@ def __call__(self, filename):
     'fsspec': ("https://filesystem-spec.readthedocs.io/en/latest/", None),
     'nwbwidgets': ("https://nwb-widgets.readthedocs.io/en/latest/", None),
     'nwb-overview': ("https://nwb-overview.readthedocs.io/en/latest/", None),
+    'hdmf-zarr': ("https://hdmf-zarr.readthedocs.io/en/latest/", None),
+    'numcodecs': ("https://numcodecs.readthedocs.io/en/latest/", None),
 }

 extlinks = {
@@ -159,6 +161,7 @@ def __call__(self, filename):
     'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/%s', '%s'),
     'dandi': ('https://www.dandiarchive.org/%s', '%s'),
     "nwbinspector": ("https://nwbinspector.readthedocs.io/en/dev/%s", "%s"),
+    'hdmf-zarr': ('https://hdmf-zarr.readthedocs.io/en/latest/%s', '%s'),
 }

 # Add any paths that contain templates here, relative to this directory.
diff --git a/docs/source/figures/gallery_thumbnail_plot_nwbzarrio.png b/docs/source/figures/gallery_thumbnail_plot_nwbzarrio.png
new file mode 100644
index 000000000..8926a47ff
Binary files /dev/null and b/docs/source/figures/gallery_thumbnail_plot_nwbzarrio.png differ
diff --git a/requirements-doc.txt b/requirements-doc.txt
index 2050f4439..c37aee646 100644
--- a/requirements-doc.txt
+++ b/requirements-doc.txt
@@ -12,3 +12,4 @@ dataframe_image # used to render large dataframe as image in the sphinx galler
 lxml # used by dataframe_image when using the matplotlib backend
 hdf5plugin
 dandi>=0.46.6
+hdmf-zarr
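As a quick sanity check of the round trip that the new tutorial walks through, here is a minimal sketch (not part of the diff above) that writes a compressed dataset with `ZarrDataIO` and reads it back with `NWBZarrIO`. It assumes `pynwb`, `hdmf-zarr`, and `numcodecs` are installed; the store path `round_trip_check.nwb.zarr` is hypothetical.

```python
# Minimal round-trip sketch based on the tutorial above; assumes pynwb,
# hdmf-zarr, and numcodecs are installed. The store path is hypothetical.
from datetime import datetime

import numpy as np
from dateutil.tz import tzlocal
from hdmf_zarr import ZarrDataIO
from hdmf_zarr.nwb import NWBZarrIO
from numcodecs import Blosc
from pynwb import NWBFile, TimeSeries

data = np.random.randn(100, 100)

nwbfile = NWBFile(
    session_description="round-trip check",
    identifier="EXAMPLE_ID",
    session_start_time=datetime.now(tzlocal()),
)
nwbfile.add_acquisition(
    TimeSeries(
        name="synthetic_timeseries",
        # Chunk and compress the data with a Blosc-zstd compressor, as in the tutorial.
        data=ZarrDataIO(
            data=data,
            chunks=(10, 10),
            compressor=Blosc(cname="zstd", clevel=3, shuffle=Blosc.SHUFFLE),
        ),
        unit="m",
        rate=10e3,
    )
)

path = "round_trip_check.nwb.zarr"  # hypothetical store path
with NWBZarrIO(path=path, mode="w") as io:
    io.write(nwbfile)

with NWBZarrIO(path=path, mode="r") as io:
    read_nwbfile = io.read()
    read_data = read_nwbfile.acquisition["synthetic_timeseries"].data[:]
    # Compression is transparent on read: values should match the source exactly.
    np.testing.assert_array_equal(read_data, data)
```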