diff --git a/CHANGELOG.md b/CHANGELOG.md index e2161a28b..2ddcdda7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,8 @@ - Fix bug where pynwb version was reported as "unknown" to readthedocs @stephprince [#1810](https://github.com/NeurodataWithoutBorders/pynwb/pull/1810) ### Documentation and tutorial enhancements -- Add RemFile to streaming tutorial @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761) +- Add RemFile to streaming tutorial. @bendichter [#1761](https://github.com/NeurodataWithoutBorders/pynwb/pull/1761) +- Fix typos and improve clarity throughout tutorials. @zm711 [#1825](https://github.com/NeurodataWithoutBorders/pynwb/pull/1825) ## PyNWB 2.5.0 (August 18, 2023) diff --git a/docs/gallery/advanced_io/linking_data.py b/docs/gallery/advanced_io/linking_data.py index 82824f6cd..2f79d1488 100644 --- a/docs/gallery/advanced_io/linking_data.py +++ b/docs/gallery/advanced_io/linking_data.py @@ -221,7 +221,7 @@ # Step 2: Add the container to another NWBFile # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # To integrate both :py:meth:`~pynwb.base.TimeSeries` into a single file we simply create a new -# :py:meth:`~pynwb.file.NWBFile` and our existing :py:meth:`~pynwb.base.TimeSeries` to it. PyNWB's +# :py:meth:`~pynwb.file.NWBFile` and add our existing :py:meth:`~pynwb.base.TimeSeries` to it. PyNWB's # :py:class:`~pynwb.NWBHDF5IO` backend then automatically detects that the TimeSeries have already # been written to another file and will create external links for us. # diff --git a/docs/gallery/advanced_io/plot_iterative_write.py b/docs/gallery/advanced_io/plot_iterative_write.py index c461cddf8..958981a0b 100644 --- a/docs/gallery/advanced_io/plot_iterative_write.py +++ b/docs/gallery/advanced_io/plot_iterative_write.py @@ -17,7 +17,7 @@ # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # # In the typical write process, datasets are created and written as a whole.
In contrast, -# iterative data write refers to the writing of the content of a dataset in an incremental, +# iterative data write refers to the writing of the contents of a dataset in an incremental, # iterative fashion. #################### @@ -32,10 +32,10 @@ # to avoid this problem by writing the data one-subblock-at-a-time, so that we only need to hold # a small subset of the array in memory at any given time. # * **Data streaming** In the context of streaming data we are faced with several issues: -# **1)** data is not available in memory but arrives in subblocks as the stream progresses +# **1)** data is not available in-memory but arrives in subblocks as the stream progresses # **2)** caching the data of a stream in-memory is often prohibitively expensive and volatile # **3)** the total size of the data is often unknown ahead of time. -# Iterative data write allows us to address issues 1) and 2) by enabling us to save data to +# Iterative data write allows us to address issues 1) and 2) by enabling us to save data to a # file incrementally as it arrives from the data stream. Issue 3) is addressed in the HDF5 # storage backend via support for chunking, enabling the creation of resizable arrays. # @@ -44,7 +44,7 @@ # data source. # # * **Sparse data arrays** In order to reduce storage size of sparse arrays a challenge is that while -# the data array (e.g., a matrix) may be large, only few values are set. To avoid storage overhead +# the data array (e.g., a matrix) may be large, only a few values are set. To avoid storage overhead # for storing the full array we can employ (in HDF5) a combination of chunking, compression, and # and iterative data write to significantly reduce storage cost for sparse data. # @@ -161,7 +161,7 @@ def write_test_file(filename, data, close_io=True): # # Here we use a simple data generator but PyNWB does not make any assumptions about what happens # inside the generator. 
Instead of creating data programmatically, you may hence, e.g., receive -# data from an acquisition system (or other source). We can, hence, use the same approach to write streaming data. +# data from an acquisition system (or other source). We can use the same approach to write streaming data. #################### # Step 1: Define the data generator @@ -208,7 +208,7 @@ def iter_sin(chunk_length=10, max_chunks=100): #################### # Discussion # ^^^^^^^^^^ -# Note, we here actually do not know how long our timeseries will be. +# Note, here we don't actually know how long our timeseries will be. print( "maxshape=%s, recommended_data_shape=%s, dtype=%s" @@ -218,7 +218,7 @@ def iter_sin(chunk_length=10, max_chunks=100): #################### # As we can see :py:class:`~hdmf.data_utils.DataChunkIterator` automatically recommends # in its ``maxshape`` that the first dimensions of our array should be unlimited (``None``) and the second -# dimension be ``10`` (i.e., the length of our chunk. Since :py:class:`~hdmf.data_utils.DataChunkIterator` +# dimension should be ``10`` (i.e., the length of our chunk). Since :py:class:`~hdmf.data_utils.DataChunkIterator` # has no way of knowing the minimum size of the array it automatically recommends the size of the first # chunk as the minimum size (i.e, ``(1, 10)``) and also infers the data type automatically from the first chunk. # To further customize this behavior we may also define the ``maxshape``, ``dtype``, and ``buffer_size`` when @@ -227,8 +227,8 @@ def iter_sin(chunk_length=10, max_chunks=100): # .. tip:: # # We here used :py:class:`~hdmf.data_utils.DataChunkIterator` to conveniently wrap our data stream.
-# :py:class:`~hdmf.data_utils.DataChunkIterator` assumes that our generators yields in **consecutive order** -# **single** complete element along the **first dimension** of our a array (i.e., iterate over the first +# :py:class:`~hdmf.data_utils.DataChunkIterator` assumes that our generator yields in **consecutive order** +# a **single** complete element along the **first dimension** of our array (i.e., iterate over the first # axis and yield one-element-at-a-time). This behavior is useful in many practical cases. However, if # this strategy does not match our needs, then using :py:class:`~hdmf.data_utils.GenericDataChunkIterator` # or implementing your own derived :py:class:`~hdmf.data_utils.AbstractDataChunkIterator` may be more @@ -266,7 +266,7 @@ def __next__(self): """ Return in each iteration a fully occupied data chunk of self.chunk_shape values at a random location within the matrix. Chunks are non-overlapping. REMEMBER: h5py does not support all - fancy indexing that numpy does so we need to make sure our selection can be + the fancy indexing that numpy does so we need to make sure our selection can be handled by the backend. """ if self.__chunks_created < self.num_chunks: @@ -289,7 +289,7 @@ def __next__(self): next = __next__ def recommended_chunk_shape(self): - # Here we can optionally recommend what a good chunking should be. + # Here we can optionally recommend what a good chunking could be. 
return self.chunk_shape def recommended_data_shape(self): @@ -379,7 +379,7 @@ def maxshape(self): # Now lets check out the size of our data file and compare it against the expected full size of our matrix import os -expected_size = xsize * ysize * 8 # This is the full size of our matrix in byte +expected_size = xsize * ysize * 8 # This is the full size of our matrix in bytes occupied_size = num_values * 8 # Number of non-zero values in out matrix file_size = os.stat( "basic_sparse_iterwrite_example.nwb" @@ -420,14 +420,14 @@ def maxshape(self): # A slight overhead (here 0.08MB) is expected because our file contains also the additional objects from # the NWBFile, plus some overhead for managing all the HDF5 metadata for all objects. # * **3) vs 2):** Adding compression does not yield any improvement here. This is expected, because, again we -# selected the chunking here in a way that we already allocated the minimum amount of storage to represent our data +# selected the chunking here in a way that we already allocated the minimum amount of storage to represent our data # and lossless compression of random data is not efficient. # * **4) vs 2):** When we increase our chunk size to ``(100,100)`` (i.e., ``100x`` larger than the chunks produced by -# our matrix generator) we observe an according roughly ``100x`` increase in file size. This is expected +# our matrix generator) we observe a corresponding, roughly ``100x`` increase in file size. This is expected # since our chunks now do not align perfectly with the occupied data and each occupied chunk is allocated fully. # * **5) vs 4):** When using compression for the larger chunks we see a significant reduction # in file size (``1.14MB`` vs. ``80MB``). This is because the allocated chunks now contain in addition to the random -# values large areas of constant fillvalues, which compress easily. +# values large areas of constant fill values, which compress easily.
# # **Advantages:** # @@ -435,12 +435,12 @@ def maxshape(self): # * Only the data chunks in the HDF5 file that contain non-default values are ever being allocated # * The overall size of our file is reduced significantly # * Reduced I/O load -# * On read users can use the array as usual +# * On read, users can use the array as usual # # .. tip:: # -# With great power comes great responsibility **!** I/O and storage cost will depend among others on the chunk size, -# compression options, and the write pattern, i.e., the number and structure of the +# With great power comes great responsibility **!** I/O and storage cost will depend, among other factors, +# on the chunk size, compression options, and the write pattern, i.e., the number and structure of the # :py:class:`~hdmf.data_utils.DataChunk` objects written. For example, using ``(1,1)`` chunks and writing them # one value at a time would result in poor I/O performance in most practical cases, because of the large number of # chunks and large number of small I/O operations required. @@ -471,7 +471,7 @@ def maxshape(self): # # When converting large data files, a typical problem is that it is often too expensive to load all the data # into memory. This example is very similar to the data generator example only that instead of generating -# data on-the-fly in memory we are loading data from a file one-chunk-at-a-time in our generator. +# data on-the-fly in-memory we are loading data from a file one-chunk-at-a-time in our generator. # #################### @@ -568,7 +568,7 @@ def iter_largearray(filename, shape, dtype="float64"): # In practice, data from recording devices may be distributed across many files, e.g., one file per time range # or one file per recording channel. Using iterative data write provides an elegant solution to this problem # as it allows us to process large arrays one-subarray-at-a-time. 
To make things more interesting we'll show -# this for the case where each recording channel (i.e, the second dimension of our ``TimeSeries``) is broken up +# this for the case where each recording channel (i.e., the second dimension of our ``TimeSeries``) is broken up # across files. #################### diff --git a/docs/gallery/advanced_io/streaming.py b/docs/gallery/advanced_io/streaming.py index 064a37d09..760e2da71 100644 --- a/docs/gallery/advanced_io/streaming.py +++ b/docs/gallery/advanced_io/streaming.py @@ -131,9 +131,11 @@ ################################################## # Method 3: remfile # ----------------- -# ``remfile`` is another library that enables indexing and streaming of files in s3. remfile is simple, fast, and -# allows for caching of data in the local filesystem. The caveats of ``remfile`` are that it is a very new project -# that has not been tested in a variety of use-cases and caching options are limited compared to ``fsspec``. +# ``remfile`` is another library that enables indexing and streaming of files in s3. remfile is simple and fast, +# especially for the initial load of the NWB file and for accessing small pieces of data. The caveats of ``remfile`` +# are that it is a very new project that has not been tested in a variety of use-cases and caching options are +# limited compared to ``fsspec``. ``remfile`` is a simple, lightweight dependency with a very small codebase. +# # You can install ``remfile`` with pip: # # .. code-block:: bash @@ -159,11 +161,14 @@ # From a user perspective, once opened, the :py:class:`~pynwb.file.NWBFile` works the same with # fsspec, ros3, or remfile. However, in general, we currently recommend using fsspec for streaming # NWB files because it is more performant and reliable than ros3 and more widely tested than remfile. -# In particular, fsspec: +# That said, if you are experiencing long wait times for the initial file load on your network, you +# may want to try remfile.
+# +# Advantages of fsspec include: # # 1. supports caching, which will dramatically speed up repeated requests for the # same region of data, # 2. automatically retries when s3 fails to return, which helps avoid errors when accessing data due to -# intermittent errors in connections with S3, +# intermittent errors in connections with S3 (remfile does this as well), # 3. works also with other storage backends (e.g., GoogleDrive or Dropbox, not just S3) and file formats, and # 4. in our experience appears to provide faster out-of-the-box performance than the ros3 driver. diff --git a/docs/gallery/general/add_remove_containers.py b/docs/gallery/general/add_remove_containers.py index 26708f639..80c5cb032 100644 --- a/docs/gallery/general/add_remove_containers.py +++ b/docs/gallery/general/add_remove_containers.py @@ -77,7 +77,7 @@ # modifies the data on disk # (the :py:meth:`NWBHDF5IO.write ` method does not need to be called and the # :py:class:`~pynwb.NWBHDF5IO` instance does not need to be closed). Directly modifying datasets in this way -# can lead to files that do not validate or cannot be opened, so take caution when using this method. +# can lead to files that do not validate or cannot be opened, so exercise caution when using this method. # Note: only chunked datasets or datasets with ``maxshape`` set can be resized. # See the `h5py chunked storage documentation `_ # for more details. diff --git a/docs/gallery/general/extensions.py b/docs/gallery/general/extensions.py index 66645a660..4ec8f4749 100644 --- a/docs/gallery/general/extensions.py +++ b/docs/gallery/general/extensions.py @@ -100,7 +100,7 @@ # Using extensions # ----------------------------------------------------- # -# After an extension has been created, it can be used by downstream codes for reading and writing data. +# After an extension has been created, it can be used by downstream code for reading and writing data. 
# There are two main mechanisms for reading and writing extension data with PyNWB. # The first involves defining new :py:class:`~pynwb.core.NWBContainer` classes that are then mapped # to the neurodata types in the extension. @@ -167,7 +167,7 @@ def __init__(self, **kwargs): # By default, extensions are cached to file so that your NWB file will carry the extensions needed to read the file # with it. # -# To demonstrate this, first we will make some fake data using our extensions. +# To demonstrate this, first we will make some simulated data using our extensions. from datetime import datetime @@ -370,17 +370,17 @@ class PotatoSack(MultiContainerInterface): nwb = io.read() print(nwb.get_processing_module()["potato_sack"].get_potato("big_potato").weight) # note: you can call get_processing_module() with or without the module name as -# an argument. however, if there is more than one module, the name is required. -# here, there is more than one potato, so the name of the potato is required as -# an argument to get get_potato +# an argument. However, if there is more than one module, the name is required. +# Here, there is more than one potato, so the name of the potato is required as +# an argument to get_potato #################### # Example: Cortical Surface Mesh # ----------------------------------------------------- # # Here we show how to create extensions by creating a data class for a -# cortical surface mesh. This data type is particularly important for ECoG data, we need to know where each electrode is -# with respect to the gyri and sulci. Surface mesh objects contain two types of data: +# cortical surface mesh. This data type is particularly important for ECoG data, since we need to know where +# each electrode is with respect to the gyri and sulci. Surface mesh objects contain two types of data: # # 1. 
`vertices`, which is an (n, 3) matrix of floats that represents points in 3D space # diff --git a/docs/gallery/general/object_id.py b/docs/gallery/general/object_id.py index 481cbb36a..206142715 100644 --- a/docs/gallery/general/object_id.py +++ b/docs/gallery/general/object_id.py @@ -32,7 +32,7 @@ session_start_time=start_time, ) -# make some fake data +# make some simulated data timestamps = np.linspace(0, 100, 1024) data = ( np.sin(0.333 * timestamps)