
Commit
Merge branch 'dev' into tmc
mavaylon1 authored Jan 31, 2024
2 parents 5bd2a25 + c40df49 commit 9fdd1c3
Showing 6 changed files with 104 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -19,6 +19,7 @@
### Documentation and tutorial enhancements
- Add RemFile to streaming tutorial. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761)
- Fix typos and improve clarity throughout tutorials. @zm711 [#1825](https://github.com/NeurodataWithoutBorders/pynwb/pull/1825)
- Add Zarr IO tutorial. @bendichter [#1834](https://github.com/NeurodataWithoutBorders/pynwb/pull/1834)

## PyNWB 2.5.0 (August 18, 2023)

98 changes: 98 additions & 0 deletions docs/gallery/advanced_io/plot_zarr_io.py
@@ -0,0 +1,98 @@
"""
Zarr IO
=======
Zarr is an alternative backend option for NWB files. It is a Python package that
provides an implementation of chunked, compressed, N-dimensional arrays. Zarr is a good
option for large datasets because, like HDF5, it is designed to store data on disk and
only load the data into memory when needed. Zarr is also a good option for parallel
computing because it supports concurrent reads and writes.

Note that Zarr's native storage format is optimized for cloud object storage
(e.g., S3). For very large files, Zarr will create many files, which can lead to
issues on traditional file systems (that are not cloud object stores) due to limitations
on the number of files per directory (this affects local disk, GDrive, Dropbox, etc.).

Zarr read and write is provided by the :hdmf-zarr:`hdmf-zarr<>` package. First, create
an NWBFile using PyNWB.
"""

# sphinx_gallery_thumbnail_path = 'figures/gallery_thumbnail_plot_nwbzarrio.png'


from datetime import datetime
from dateutil.tz import tzlocal

import numpy as np
from pynwb import NWBFile, TimeSeries

# Create the NWBFile. Substitute your NWBFile generation here.
nwbfile = NWBFile(
    session_description="my first synthetic recording",
    identifier="EXAMPLE_ID",
    session_start_time=datetime.now(tzlocal()),
    session_id="LONELYMTN",
)

#######################################################################################
# Dataset Configuration
# ---------------------
# Like HDF5, Zarr provides options to chunk and compress datasets. To leverage these
# features, replace all :py:class:`~hdmf.backends.hdf5.h5_utils.H5DataIO` with the
# analogous :py:class:`~hdmf_zarr.utils.ZarrDataIO`, which takes compressors specified
# by the :py:mod:`numcodecs` library. For example, here is a :py:class:`.TimeSeries`
# where the ``data`` Dataset is compressed with a Blosc-zstd compressor:

from numcodecs import Blosc
from hdmf_zarr import ZarrDataIO

data_with_zarr_data_io = ZarrDataIO(
    data=np.random.randn(100, 100),
    chunks=(10, 10),  # store the data in 10 x 10 chunks
    fillvalue=0,
    compressor=Blosc(cname='zstd', clevel=3, shuffle=Blosc.SHUFFLE)
)

#######################################################################################
# Now add it to the :py:class:`.NWBFile`.

nwbfile.add_acquisition(
    TimeSeries(
        name="synthetic_timeseries",
        data=data_with_zarr_data_io,
        unit="m",
        rate=10e3,
    )
)

#######################################################################################
# Writing to Zarr
# ---------------
# To write NWB files to Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with
# :py:class:`hdmf_zarr.nwb.NWBZarrIO`.

from hdmf_zarr.nwb import NWBZarrIO
import os

path = "zarr_tutorial.nwb.zarr"
absolute_path = os.path.abspath(path)
with NWBZarrIO(path=path, mode="w") as io:
    io.write(nwbfile)
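
#######################################################################################
# Zarr writes the file as a directory store rather than a single file. As a quick
# sanity check (a sketch, assuming the default directory layout), we can list a few of
# the entries that were just created:

print(sorted(os.listdir(absolute_path))[:5])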

#######################################################################################
# .. note::
#    The ``absolute_path`` is used here for testing purposes, to ensure that links and
#    references work as expected. Otherwise, using the relative path here is fine.
#
# Reading from Zarr
# -----------------
# To read NWB files from Zarr, replace the :py:class:`~pynwb.NWBHDF5IO` with the analogous
# :py:class:`hdmf_zarr.nwb.NWBZarrIO`.

with NWBZarrIO(path=absolute_path, mode="r") as io:
    read_nwbfile = io.read()
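
#######################################################################################
# As a minimal sketch of working with the read file (assuming the names used above),
# the acquisition can be retrieved and its data sliced lazily while the file is open:

with NWBZarrIO(path=absolute_path, mode="r") as io:
    read_nwbfile = io.read()
    read_timeseries = read_nwbfile.acquisition["synthetic_timeseries"]
    print(read_timeseries.data[:5, :5])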

#######################################################################################
# .. note::
#    For more information, see the :hdmf-zarr:`hdmf-zarr documentation<>`.
2 changes: 1 addition & 1 deletion docs/gallery/advanced_io/streaming.py
@@ -169,6 +169,6 @@
# 1. supports caching, which will dramatically speed up repeated requests for the
#    same region of data,
# 2. automatically retries when s3 fails to return, which helps avoid errors when accessing data due to
-#    intermittent errors in connections with S3 (remfile does this as well),
+#    intermittent errors in connections with S3 (remfile does this as well),
# 3. works also with other storage backends (e.g., GoogleDrive or Dropbox, not just S3) and file formats, and
# 4. in our experience appears to provide faster out-of-the-box performance than the ros3 driver.
3 changes: 3 additions & 0 deletions docs/source/conf.py
@@ -148,6 +148,8 @@ def __call__(self, filename):
    'fsspec': ("https://filesystem-spec.readthedocs.io/en/latest/", None),
    'nwbwidgets': ("https://nwb-widgets.readthedocs.io/en/latest/", None),
    'nwb-overview': ("https://nwb-overview.readthedocs.io/en/latest/", None),
    'hdmf-zarr': ("https://hdmf-zarr.readthedocs.io/en/latest/", None),
    'numcodecs': ("https://numcodecs.readthedocs.io/en/latest/", None),
}

extlinks = {
@@ -159,6 +161,7 @@
    'hdmf-docs': ('https://hdmf.readthedocs.io/en/stable/%s', '%s'),
    'dandi': ('https://www.dandiarchive.org/%s', '%s'),
    "nwbinspector": ("https://nwbinspector.readthedocs.io/en/dev/%s", "%s"),
    'hdmf-zarr': ('https://hdmf-zarr.readthedocs.io/en/latest/%s', '%s'),
}

# Add any paths that contain templates here, relative to this directory.
Binary file not shown.
1 change: 1 addition & 0 deletions requirements-doc.txt
@@ -12,3 +12,4 @@ dataframe_image # used to render large dataframe as image in the sphinx galler
lxml # used by dataframe_image when using the matplotlib backend
hdf5plugin
dandi>=0.46.6
hdmf-zarr
