From 98c3ba0021b5d2b5af1ad29c308e5b324ee7f0bc Mon Sep 17 00:00:00 2001 From: Joseph Hamman Date: Tue, 17 Dec 2024 17:29:17 -0800 Subject: [PATCH] put todo sections in their own page --- docs/user-guide/arrays.rst | 610 ------------------------------------ docs/user-guide/index.rst | 1 + docs/user-guide/storage.rst | 2 + docs/user-guide/todo.rst | 566 +++++++++++++++++++++++++++++++++ 4 files changed, 569 insertions(+), 610 deletions(-) create mode 100644 docs/user-guide/todo.rst diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst index f69a7ddc06..42c3b1287a 100644 --- a/docs/user-guide/arrays.rst +++ b/docs/user-guide/arrays.rst @@ -485,613 +485,3 @@ Any combination of integer and slice can be used for block indexing: .. ipython:: python z.blocks[2, 1:3] - -Indexing fields in structured arrays -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -All selection methods support a ``fields`` parameter which allows retrieving or -replacing data for a specific field in an array with a structured dtype. E.g.: - -.. ipython:: python - - a = np.array([(b'aaa', 1, 4.2), (b'bbb', 2, 8.4), (b'ccc', 3, 12.6)], dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')]) - None # TODO: z = zarr.array(a) - None # TODO: z['foo'] - None # TODO: z['baz'] - None # TODO: z.get_basic_selection(slice(0, 2), fields='bar') - None # TODO: z.get_coordinate_selection([0, 2], fields=['foo', 'baz']) - -.. .. _tutorial_strings: - -.. String arrays -.. ------------- - -.. There are several options for storing arrays of strings. - -.. If your strings are all ASCII strings, and you know the maximum length of the string in -.. your array, then you can use an array with a fixed-length bytes dtype. E.g.: - -.. >>> z = zarr.zeros(10, dtype='S6') -.. >>> z -.. -.. >>> z[0] = b'Hello' -.. >>> z[1] = b'world!' -.. >>> z[:] -.. array([b'Hello', b'world!', b'', b'', b'', b'', b'', b'', b'', b''], -.. dtype='|S6') - -.. A fixed-length unicode dtype is also available, e.g.: - -.. >>> greetings = ['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!', -.. ... 'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!', -.. ... 'こんにちは世界', '世界,你好!', 'Helló, világ!', 'Zdravo svete!', -.. ... 'เฮลโลเวิลด์'] -.. >>> text_data = greetings * 10000 -.. >>> z = zarr.array(text_data, dtype='U20') -.. >>> z -.. -.. >>> z[:] -.. array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., -.. 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], -.. dtype='>> import numcodecs -.. >>> z = zarr.array(text_data, dtype=object, object_codec=numcodecs.VLenUTF8()) -.. >>> z -.. -.. >>> z.filters -.. [VLenUTF8()] -.. >>> z[:] -.. array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., -.. 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object) - -.. As a convenience, ``dtype=str`` (or ``dtype=unicode`` on Python 2.7) can be used, which -.. is a short-hand for ``dtype=object, object_codec=numcodecs.VLenUTF8()``, e.g.: - -.. >>> z = zarr.array(text_data, dtype=str) -.. >>> z -.. -.. >>> z.filters -.. [VLenUTF8()] -.. >>> z[:] -.. array(['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', ..., -.. 'Helló, világ!', 'Zdravo svete!', 'เฮลโลเวิลด์'], dtype=object) - -.. Variable-length byte strings are also supported via ``dtype=object``. Again an -.. ``object_codec`` is required, which can be one of :class:`numcodecs.vlen.VLenBytes` or -.. :class:`numcodecs.pickles.Pickle`. For convenience, ``dtype=bytes`` (or ``dtype=str`` on Python -.. 2.7) can be used as a short-hand for ``dtype=object, object_codec=numcodecs.VLenBytes()``, -.. e.g.: - -.. 
>>> bytes_data = [g.encode('utf-8') for g in greetings] * 10000 -.. >>> z = zarr.array(bytes_data, dtype=bytes) -.. >>> z -.. -.. >>> z.filters -.. [VLenBytes()] -.. >>> z[:] -.. array([b'\xc2\xa1Hola mundo!', b'Hej V\xc3\xa4rlden!', b'Servus Woid!', -.. ..., b'Hell\xc3\xb3, vil\xc3\xa1g!', b'Zdravo svete!', -.. b'\xe0\xb9\x80\xe0\xb8\xae\xe0\xb8\xa5\xe0\xb9\x82\xe0\xb8\xa5\xe0\xb9\x80\xe0\xb8\xa7\xe0\xb8\xb4\xe0\xb8\xa5\xe0\xb8\x94\xe0\xb9\x8c'], dtype=object) - -.. If you know ahead of time all the possible string values that can occur, you could -.. also use the :class:`numcodecs.categorize.Categorize` codec to encode each unique string value as an -.. integer. E.g.: - -.. .. ipython:: - -.. In [1]: categorize = numcodecs.Categorize(greetings, dtype=object) - -.. In [1]: z = zarr.array(text_data, dtype=object, object_codec=categorize) - -.. In [1]: z - -.. In [1]: z.filters - -.. In [1]: z[:] - -.. .. _tutorial_objects: - -.. Object arrays -.. ------------- - -.. Zarr supports arrays with an "object" dtype. This allows arrays to contain any type of -.. object, such as variable length unicode strings, or variable length arrays of numbers, or -.. other possibilities. When creating an object array, a codec must be provided via the -.. ``object_codec`` argument. This codec handles encoding (serialization) of Python objects. -.. The best codec to use will depend on what type of objects are present in the array. - -.. At the time of writing there are three codecs available that can serve as a general -.. purpose object codec and support encoding of a mixture of object types: -.. :class:`numcodecs.json.JSON`, :class:`numcodecs.msgpacks.MsgPack`. and :class:`numcodecs.pickles.Pickle`. - -.. For example, using the JSON codec: - -.. .. ipython:: - -.. In [1]: z = zarr.empty(5, dtype=object, object_codec=numcodecs.JSON()) - -.. In [1]: z[0] = 42 - -.. In [1]: z[1] = 'foo' - -.. In [1]: z[2] = ['bar', 'baz', 'qux'] - -.. In [1]: z[3] = {'a': 1, 'b': 2.2} - -.. In [1]: z[:] - -.. Not all codecs support encoding of all object types. The -.. :class:`numcodecs.pickles.Pickle` codec is the most flexible, supporting encoding any type -.. of Python object. However, if you are sharing data with anyone other than yourself, then -.. Pickle is not recommended as it is a potential security risk. This is because malicious -.. code can be embedded within pickled data. The JSON and MsgPack codecs do not have any -.. security issues and support encoding of unicode strings, lists and dictionaries. -.. MsgPack is usually faster for both encoding and decoding. - -.. Ragged arrays -.. ~~~~~~~~~~~~~ - -.. If you need to store an array of arrays, where each member array can be of any length -.. and stores the same primitive type (a.k.a. a ragged array), the -.. :class:`numcodecs.vlen.VLenArray` codec can be used, e.g.: - -.. .. ipython:: - -.. In [1]: z = zarr.empty(4, dtype=object, object_codec=numcodecs.VLenArray(int)) - -.. In [1]: z - -.. In [1]: z.filters - -.. In [1]: z[0] = np.array([1, 3, 5]) - -.. In [1]: z[1] = np.array([4]) - -.. In [1]: z[2] = np.array([7, 9, 14]) - -.. In [1]: z[:] - -.. As a convenience, ``dtype='array:T'`` can be used as a short-hand for -.. ``dtype=object, object_codec=numcodecs.VLenArray('T')``, where 'T' can be any NumPy -.. primitive dtype such as 'i4' or 'f8'. E.g.: - -.. .. ipython:: - -.. In [1]: z = zarr.empty(4, dtype='array:i8') - -.. In [1]: z - -.. In [1]: z.filters - -.. In [1]: z[0] = np.array([1, 3, 5]) - -.. In [1]: z[1] = np.array([4]) - -.. 
In [1]: z[2] = np.array([7, 9, 14]) - -.. In [1]: z[:] - -.. .. _tutorial_chunks: - -.. Chunk optimizations -.. ------------------- - -.. .. _tutorial_chunks_shape: - -.. Chunk size and shape -.. ~~~~~~~~~~~~~~~~~~~~ - -.. In general, chunks of at least 1 megabyte (1M) uncompressed size seem to provide -.. better performance, at least when using the Blosc compression library. - -.. The optimal chunk shape will depend on how you want to access the data. E.g., -.. for a 2-dimensional array, if you only ever take slices along the first -.. dimension, then chunk across the second dimension. If you know you want to chunk -.. across an entire dimension you can use ``None`` or ``-1`` within the ``chunks`` -.. argument, e.g.: - -.. .. ipython:: - -.. In [1]: z1 = zarr.zeros((10000, 10000), chunks=(100, None), dtype='i4') -.. >>> z1.chunks -.. (100, 10000) - -.. Alternatively, if you only ever take slices along the second dimension, then -.. chunk across the first dimension, e.g.: - -.. .. ipython:: - -.. In [1]: z2 = zarr.zeros((10000, 10000), chunks=(None, 100), dtype='i4') - -.. In [1]: z2.chunks -.. (10000, 100) - -.. If you require reasonable performance for both access patterns then you need to -.. find a compromise, e.g.: - -.. .. ipython:: - -.. In [1]: z3 = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4') - -.. In [1]: z3.chunks -.. (1000, 1000) - -.. If you are feeling lazy, you can let Zarr guess a chunk shape for your data by -.. providing ``chunks=True``, although please note that the algorithm for guessing -.. a chunk shape is based on simple heuristics and may be far from optimal. E.g.: - -.. .. ipython:: - -.. In [1]: z4 = zarr.zeros((10000, 10000), chunks=True, dtype='i4') - -.. In [1]: z4.chunks -.. (625, 625) - -.. If you know you are always going to be loading the entire array into memory, you -.. can turn off chunks by providing ``chunks=False``, in which case there will be -.. .. one single chunk for the array:: - -.. .. ipython:: - -.. In [1]: z5 = zarr.zeros((10000, 10000), chunks=False, dtype='i4') - -.. In [1]: z5.chunks -.. (10000, 10000) - -.. .. _tutorial_chunks_order: - -.. Chunk memory layout -.. ~~~~~~~~~~~~~~~~~~~ - -.. The order of bytes **within each chunk** of an array can be changed via the -.. ``order`` keyword argument, to use either C or Fortran layout. For -.. multi-dimensional arrays, these two layouts may provide different compression -.. ratios, depending on the correlation structure within the data. E.g.: - -.. .. ipython:: - -.. In [1]: a = np.arange(100000000, dtype='i4').reshape(10000, 10000).T - -.. In [1]: c = zarr.array(a, chunks=(1000, 1000)) - -.. In [1]: c.info - -.. In [1]: f = zarr.array(a, chunks=(1000, 1000), order='F') - -.. In [1]: f.info - -.. In the above example, Fortran order gives a better compression ratio. This is an -.. artificial example but illustrates the general point that changing the order of -.. bytes within chunks of an array may improve the compression ratio, depending on -.. the structure of the data, the compression algorithm used, and which compression -.. filters (e.g., byte-shuffle) have been applied. - -.. .. _tutorial_chunks_empty_chunks: - -.. Empty chunks -.. ~~~~~~~~~~~~ - -.. As of version 2.11, it is possible to configure how Zarr handles the storage of -.. chunks that are "empty" (i.e., every element in the chunk is equal to the array's fill value). -.. When creating an array with ``write_empty_chunks=False``, -.. Zarr will check whether a chunk is empty before compression and storage. 
If a chunk is empty, -.. then Zarr does not store it, and instead deletes the chunk from storage -.. if the chunk had been previously stored. - -.. This optimization prevents storing redundant objects and can speed up reads, but the cost is -.. added computation during array writes, since the contents of -.. each chunk must be compared to the fill value, and these advantages are contingent on the content of the array. -.. If you know that your data will form chunks that are almost always non-empty, then there is no advantage to the optimization described above. -.. In this case, creating an array with ``write_empty_chunks=True`` (the default) will instruct Zarr to write every chunk without checking for emptiness. - -.. The following example illustrates the effect of the ``write_empty_chunks`` flag on -.. the time required to write an array with different values.: - -.. .. ipython:: - -.. In [1]: import zarr - -.. In [1]: import numpy as np - -.. In [1]: import time - -.. In [1]: from tempfile import TemporaryDirectory - -.. In [1]: def timed_write(write_empty_chunks): -.. """ -.. Measure the time required and number of objects created when writing -.. to a Zarr array with random ints or fill value. -.. """ -.. chunks = (8192,) -.. shape = (chunks[0] * 1024,) -.. data = np.random.randint(0, 255, shape) -.. dtype = 'uint8' - -.. with TemporaryDirectory() as store: -.. arr = zarr.open(store, -.. shape=shape, -.. chunks=chunks, -.. dtype=dtype, -.. write_empty_chunks=write_empty_chunks, -.. fill_value=0, -.. mode='w') -.. # initialize all chunks -.. arr[:] = 100 -.. result = [] -.. for value in (data, arr.fill_value): -.. start = time.time() -.. arr[:] = value -.. elapsed = time.time() - start -.. result.append((elapsed, arr.nchunks_initialized)) - -.. return result - -.. In [1]: for write_empty_chunks in (True, False): -.. full, empty = timed_write(write_empty_chunks) -.. print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n') - -.. In this example, writing random data is slightly slower with ``write_empty_chunks=True``, -.. but writing empty data is substantially faster and generates far fewer objects in storage. - -.. .. _tutorial_rechunking: - -.. Changing chunk shapes (rechunking) -.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. Sometimes you are not free to choose the initial chunking of your input data, or -.. you might have data saved with chunking which is not optimal for the analysis you -.. have planned. In such cases it can be advantageous to re-chunk the data. For small -.. datasets, or when the mismatch between input and output chunks is small -.. such that only a few chunks of the input dataset need to be read to create each -.. chunk in the output array, it is sufficient to simply copy the data to a new array -.. with the desired chunking, e.g.: - -.. .. ipython:: - -.. In [1]: a = zarr.zeros((10000, 10000), chunks=(100,100), dtype='uint16', store='a.zarr') - -.. In [1]: b = zarr.array(a, chunks=(100, 200), store='b.zarr') - -.. If the chunk shapes mismatch, however, a simple copy can lead to non-optimal data -.. access patterns and incur a substantial performance hit when using -.. file based stores. One of the most pathological examples is -.. switching from column-based chunking to row-based chunking e.g.: - -.. .. ipython:: - -.. In [1]: a = zarr.zeros((10000,10000), chunks=(10000, 1), dtype='uint16', store='a.zarr') - -.. 
In [1]: b = zarr.array(a, chunks=(1,10000), store='b.zarr') - -.. which will require every chunk in the input data set to be repeatedly read when creating -.. each output chunk. If the entire array will fit within memory, this is simply resolved -.. by forcing the entire input array into memory as a numpy array before converting -.. back to zarr with the desired chunking. - -.. .. ipython:: - -.. In [1]: a = zarr.zeros((10000,10000), chunks=(10000, 1), dtype='uint16', store='a.zarr') - -.. In [1]: b = a[...] - -.. In [1]: c = zarr.array(b, chunks=(1,10000), store='c.zarr') - -.. For data sets which have mismatched chunks and which do not fit in memory, a -.. more sophisticated approach to rechunking, such as offered by the -.. `rechunker `_ package and discussed -.. `here `_ -.. may offer a substantial improvement in performance. - -.. .. _tutorial_sync: - -.. Parallel computing and synchronization -.. -------------------------------------- - -.. Zarr arrays have been designed for use as the source or sink for data in -.. parallel computations. By data source we mean that multiple concurrent read -.. operations may occur. By data sink we mean that multiple concurrent write -.. operations may occur, with each writer updating a different region of the -.. array. Zarr arrays have **not** been designed for situations where multiple -.. readers and writers are concurrently operating on the same array. - -.. Both multi-threaded and multi-process parallelism are possible. The bottleneck -.. for most storage and retrieval operations is compression/decompression, and the -.. Python global interpreter lock (GIL) is released wherever possible during these -.. operations, so Zarr will generally not block other Python threads from running. - -.. When using a Zarr array as a data sink, some synchronization (locking) may be -.. required to avoid data loss, depending on how data are being updated. If each -.. worker in a parallel computation is writing to a separate region of the array, -.. and if region boundaries are perfectly aligned with chunk boundaries, then no -.. synchronization is required. However, if region and chunk boundaries are not -.. perfectly aligned, then synchronization is required to avoid two workers -.. attempting to modify the same chunk at the same time, which could result in data -.. loss. - -.. To give a simple example, consider a 1-dimensional array of length 60, ``z``, -.. divided into three chunks of 20 elements each. If three workers are running and -.. each attempts to write to a 20 element region (i.e., ``z[0:20]``, ``z[20:40]`` -.. and ``z[40:60]``) then each worker will be writing to a separate chunk and no -.. synchronization is required. However, if two workers are running and each -.. attempts to write to a 30 element region (i.e., ``z[0:30]`` and ``z[30:60]``) -.. then it is possible both workers will attempt to modify the middle chunk at the -.. same time, and synchronization is required to prevent data loss. - -.. Zarr provides support for chunk-level synchronization. E.g., create an array -.. with thread synchronization: - -.. .. ipython:: - -.. In [1]: z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4', synchronizer=zarr.ThreadSynchronizer()) - -.. In [1]: z - -.. This array is safe to read or write within a multi-threaded program. - -.. Zarr also provides support for process synchronization via file locking, -.. provided that all processes have access to a shared file system, and provided -.. 
that the underlying file system supports file locking (which is not the case for -.. some networked file systems). E.g.: - -.. .. ipython:: - -.. In [1]: synchronizer = zarr.ProcessSynchronizer('data/example.sync') - -.. In [1]: z = zarr.open_array('data/example', mode='w', shape=(10000, 10000), -.. ... chunks=(1000, 1000), dtype='i4', -.. ... synchronizer=synchronizer) - -.. In [1]: z -.. - -.. This array is safe to read or write from multiple processes. - -.. When using multiple processes to parallelize reads or writes on arrays using the Blosc -.. compression library, it may be necessary to set ``numcodecs.blosc.use_threads = False``, -.. as otherwise Blosc may share incorrect global state amongst processes causing programs -.. to hang. See also the section on :ref:`tutorial_tips_blosc` below. - -.. Please note that support for parallel computing is an area of ongoing research -.. and development. If you are using Zarr for parallel computing, we welcome -.. feedback, experience, discussion, ideas and advice, particularly about issues -.. related to data integrity and performance. - -.. .. _tutorial_pickle: - -.. Pickle support -.. -------------- - -.. Zarr arrays and groups can be pickled, as long as the underlying store object can be -.. pickled. Instances of any of the storage classes provided in the :mod:`zarr.storage` -.. module can be pickled, as can the built-in ``dict`` class which can also be used for -.. storage. - -.. Note that if an array or group is backed by an in-memory store like a ``dict`` or -.. :class:`zarr.storage.MemoryStore`, then when it is pickled all of the store data will be -.. included in the pickled data. However, if an array or group is backed by a persistent -.. store like a :class:`zarr.storage.DirectoryStore`, :class:`zarr.storage.ZipStore` or -.. :class:`zarr.storage.DBMStore` then the store data **are not** pickled. The only thing -.. that is pickled is the necessary parameters to allow the store to re-open any -.. underlying files or databases upon being unpickled. - -.. E.g., pickle/unpickle an in-memory array: - -.. .. ipython:: - -.. In [1]: import pickle - -.. In [1]: z1 = zarr.array(np.arange(100000)) - -.. In [1]: s = pickle.dumps(z1) - -.. In [1]: len(s) > 5000 # relatively large because data have been pickled - -.. In [1]: z2 = pickle.loads(s) - -.. In [1]: z1 == z2 - -.. In [1]: np.all(z1[:] == z2[:]) - -.. E.g., pickle/unpickle an array stored on disk: - -.. .. ipython:: - -.. In [1]: z3 = zarr.open('data/walnuts.zarr', mode='w', shape=100000, dtype='i8') - -.. In [1]: z3[:] = np.arange(100000) - -.. In [1]: s = pickle.dumps(z3) - -.. In [1]: len(s) < 200 # small because no data have been pickled - -.. In [1]: z4 = pickle.loads(s) - -.. In [1]: z3 == z4 - -.. In [1]: np.all(z3[:] == z4[:]) - -.. .. _tutorial_datetime: - -.. Datetimes and timedeltas -.. ------------------------ - -.. NumPy's ``datetime64`` ('M8') and ``timedelta64`` ('m8') dtypes are supported for Zarr -.. arrays, as long as the units are specified. E.g.: - -.. .. ipython:: - -.. In [1]: z = zarr.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='M8[D]') - -.. In [1]: z - -.. In [1]: z[:] - -.. In [1]: z[0] - -.. In [1]: z[0] = '1999-12-31' - -.. In [1]: z[:] - -.. .. _tutorial_tips: - -.. Usage tips -.. ---------- - -.. .. _tutorial_tips_copy: - -.. Copying large arrays -.. ~~~~~~~~~~~~~~~~~~~~ - -.. Data can be copied between large arrays without needing much memory, e.g.: - -.. .. ipython:: - -.. 
In [1]: z1 = zarr.empty((10000, 10000), chunks=(1000, 1000), dtype='i4')
-
-.. In [1]: z1[:] = 42
-
-.. In [1]: z2 = zarr.empty_like(z1)
-
-.. In [1]: z2[:] = z1
-
-.. Internally the example above works chunk-by-chunk, extracting only the data from
-.. ``z1`` required to fill each chunk in ``z2``. The source of the data (``z1``)
-.. could equally be an h5py Dataset.
-
-.. .. _tutorial_tips_blosc:
-
-.. Configuring Blosc
-.. ~~~~~~~~~~~~~~~~~
-
-.. The Blosc compressor is able to use multiple threads internally to accelerate
-.. compression and decompression. By default, Blosc uses up to 8
-.. internal threads. The number of Blosc threads can be changed to increase or
-.. decrease this number, e.g.:
-
-.. .. ipython::
-
-.. In [1]: from numcodecs import blosc
-
-.. In [1]: blosc.set_nthreads(2) # doctest: +SKIP
-
-.. When a Zarr array is being used within a multi-threaded program, Zarr
-.. automatically switches to using Blosc in a single-threaded
-.. "contextual" mode. This is generally better as it allows multiple
-.. program threads to use Blosc simultaneously and prevents CPU thrashing
-.. from too many active threads. If you want to manually override this
-.. behaviour, set the value of the ``blosc.use_threads`` variable to
-.. ``True`` (Blosc always uses multiple internal threads) or ``False``
-.. (Blosc always runs in single-threaded contextual mode). To re-enable
-.. automatic switching, set ``blosc.use_threads`` to ``None``.
-
-.. Please note that if Zarr is being used within a multi-process program, Blosc may not
-.. be safe to use in multi-threaded mode and may cause the program to hang. If using Blosc
-.. in a multi-process program then it is recommended to set ``blosc.use_threads = False``.
diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst
index 4674c176b2..86229c3b00 100644
--- a/docs/user-guide/index.rst
+++ b/docs/user-guide/index.rst
@@ -11,6 +11,7 @@ User Guide
    groups
    storage
    whatsnew_v3
+   todo
 
 Advanced Topics
 ---------------
diff --git a/docs/user-guide/storage.rst b/docs/user-guide/storage.rst
index 8e7255a67c..09459064d4 100644
--- a/docs/user-guide/storage.rst
+++ b/docs/user-guide/storage.rst
@@ -1,3 +1,5 @@
+.. _tutorial_storage:
+
 Storage
 =======
 
diff --git a/docs/user-guide/todo.rst b/docs/user-guide/todo.rst
new file mode 100644
index 0000000000..65dfba23a8
--- /dev/null
+++ b/docs/user-guide/todo.rst
@@ -0,0 +1,566 @@
+TODO
+====
+
+The sections in this document require additional work.
+
+Indexing fields in structured arrays
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All selection methods support a ``fields`` parameter which allows retrieving or
+replacing data for a specific field in an array with a structured dtype. E.g.:
+
+.. ipython:: python
+
+    import numpy as np
+
+    a = np.array([(b'aaa', 1, 4.2), (b'bbb', 2, 8.4), (b'ccc', 3, 12.6)], dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')])
+    None # TODO: z = zarr.array(a)
+    None # TODO: z['foo']
+    None # TODO: z['baz']
+    None # TODO: z.get_basic_selection(slice(0, 2), fields='bar')
+    None # TODO: z.get_coordinate_selection([0, 2], fields=['foo', 'baz'])
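+
+Until the ``zarr`` calls above are ported, the intended semantics of the
+``fields`` parameter can be sketched with plain NumPy, which provides the same
+structured dtype machinery (the commented ``z.`` calls are the v2-style API and
+remain TODO):
+
+.. code-block:: python
+
+    import numpy as np
+
+    a = np.array(
+        [(b'aaa', 1, 4.2), (b'bbb', 2, 8.4), (b'ccc', 3, 12.6)],
+        dtype=[('foo', 'S3'), ('bar', 'i4'), ('baz', 'f8')],
+    )
+
+    # z['foo'] is intended to behave like NumPy field access
+    a['foo']                   # array([b'aaa', b'bbb', b'ccc'], dtype='|S3')
+
+    # z.get_basic_selection(slice(0, 2), fields='bar') corresponds to
+    a['bar'][0:2]              # array([1, 2], dtype=int32)
+
+    # z.get_coordinate_selection([0, 2], fields=['foo', 'baz']) corresponds to
+    a[['foo', 'baz']][[0, 2]]  # rows 0 and 2, restricted to two fields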
+
+.. _tutorial_strings:
+
+String arrays
+-------------
+
+There are several options for storing arrays of strings.
+
+If your strings are all ASCII strings, and you know the maximum length of the string in
+your array, then you can use an array with a fixed-length bytes dtype. E.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z = zarr.zeros(10, dtype='S6')
+.. z
+.. z[0] = b'Hello'
+.. z[1] = b'world!'
+.. z[:]
+
+A fixed-length unicode dtype is also available, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. greetings = ['¡Hola mundo!', 'Hej Världen!', 'Servus Woid!', 'Hei maailma!',
+..              'Xin chào thế giới', 'Njatjeta Botë!', 'Γεια σου κόσμε!',
+..              'こんにちは世界', '世界,你好!', 'Helló, világ!', 'Zdravo svete!',
+..              'เฮลโลเวิลด์']
+.. text_data = greetings * 10000
+.. z = zarr.array(text_data, dtype='U20')
+.. z
+.. z[:]
+
+For variable-length strings, the ``object`` dtype can be used, but a codec must be
+provided to encode the data (see also :ref:`tutorial_objects` below). At the time of
+writing there are four codecs available that can encode variable length string
+objects: :class:`numcodecs.vlen.VLenUTF8`, :class:`numcodecs.json.JSON`,
+:class:`numcodecs.msgpacks.MsgPack`, and :class:`numcodecs.pickles.Pickle`.
+E.g. using ``VLenUTF8``:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. import numcodecs
+.. z = zarr.array(text_data, dtype=object, object_codec=numcodecs.VLenUTF8())
+.. z
+.. z.filters
+.. z[:]
+
+As a convenience, ``dtype=str`` (or ``dtype=unicode`` on Python 2.7) can be used, which
+is a short-hand for ``dtype=object, object_codec=numcodecs.VLenUTF8()``, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z = zarr.array(text_data, dtype=str)
+.. z
+.. z.filters
+.. z[:]
+
+Variable-length byte strings are also supported via ``dtype=object``. Again an
+``object_codec`` is required, which can be one of :class:`numcodecs.vlen.VLenBytes` or
+:class:`numcodecs.pickles.Pickle`. For convenience, ``dtype=bytes`` (or ``dtype=str`` on Python
+2.7) can be used as a short-hand for ``dtype=object, object_codec=numcodecs.VLenBytes()``,
+e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. bytes_data = [g.encode('utf-8') for g in greetings] * 10000
+.. z = zarr.array(bytes_data, dtype=bytes)
+.. z
+.. z.filters
+.. z[:]
+
+If you know ahead of time all the possible string values that can occur, you could
+also use the :class:`numcodecs.categorize.Categorize` codec to encode each unique string value as an
+integer. E.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. categorize = numcodecs.Categorize(greetings, dtype=object)
+.. z = zarr.array(text_data, dtype=object, object_codec=categorize)
+.. z
+.. z.filters
+.. z[:]
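+
+While the array-level examples above remain disabled, the variable-length
+string codecs themselves can be exercised standalone; a minimal sketch that
+round-trips an object array of unicode strings through
+:class:`numcodecs.vlen.VLenUTF8` (illustrative only, not yet wired into a Zarr
+array on this page):
+
+.. code-block:: python
+
+    import numcodecs
+    import numpy as np
+
+    codec = numcodecs.VLenUTF8()
+    text = np.array(['¡Hola mundo!', 'Hej Världen!', 'こんにちは世界'], dtype=object)
+
+    buf = codec.encode(text)   # serialize all strings into one buffer
+    out = codec.decode(buf)    # decode back to an object array of str
+    assert (out == text).all()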
+
+.. _tutorial_objects:
+
+Object arrays
+-------------
+
+Zarr supports arrays with an "object" dtype. This allows arrays to contain any type of
+object, such as variable length unicode strings, or variable length arrays of numbers, or
+other possibilities. When creating an object array, a codec must be provided via the
+``object_codec`` argument. This codec handles encoding (serialization) of Python objects.
+The best codec to use will depend on what type of objects are present in the array.
+
+At the time of writing there are three codecs available that can serve as a general
+purpose object codec and support encoding of a mixture of object types:
+:class:`numcodecs.json.JSON`, :class:`numcodecs.msgpacks.MsgPack`, and :class:`numcodecs.pickles.Pickle`.
+
+For example, using the JSON codec:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z = zarr.empty(5, dtype=object, object_codec=numcodecs.JSON())
+.. z[0] = 42
+.. z[1] = 'foo'
+.. z[2] = ['bar', 'baz', 'qux']
+.. z[3] = {'a': 1, 'b': 2.2}
+.. z[:]
+
+Not all codecs support encoding of all object types. The
+:class:`numcodecs.pickles.Pickle` codec is the most flexible, supporting encoding of any
+type of Python object. However, if you are sharing data with anyone other than yourself, then
+Pickle is not recommended as it is a potential security risk. This is because malicious
+code can be embedded within pickled data. The JSON and MsgPack codecs do not have any
+security issues and support encoding of unicode strings, lists and dictionaries.
+MsgPack is usually faster for both encoding and decoding.
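+
+As with the string codecs, an object codec can be tried out directly before it
+is attached to an array. A minimal sketch using :class:`numcodecs.json.JSON` to
+round-trip a mixture of object types:
+
+.. code-block:: python
+
+    import numcodecs
+    import numpy as np
+
+    codec = numcodecs.JSON()
+    objs = np.empty(4, dtype=object)
+    objs[0] = 42
+    objs[1] = 'foo'
+    objs[2] = ['bar', 'baz', 'qux']
+    objs[3] = {'a': 1, 'b': 2.2}
+
+    out = codec.decode(codec.encode(objs))  # round-trip through JSON text
+    assert out[3] == {'a': 1, 'b': 2.2}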
+
+Ragged arrays
+~~~~~~~~~~~~~
+
+If you need to store an array of arrays, where each member array can be of any length
+and stores the same primitive type (a.k.a. a ragged array), the
+:class:`numcodecs.vlen.VLenArray` codec can be used, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z = zarr.empty(4, dtype=object, object_codec=numcodecs.VLenArray(int))
+.. z
+.. z.filters
+.. z[0] = np.array([1, 3, 5])
+.. z[1] = np.array([4])
+.. z[2] = np.array([7, 9, 14])
+.. z[:]
+
+As a convenience, ``dtype='array:T'`` can be used as a short-hand for
+``dtype=object, object_codec=numcodecs.VLenArray('T')``, where 'T' can be any NumPy
+primitive dtype such as 'i4' or 'f8'. E.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z = zarr.empty(4, dtype='array:i8')
+.. z
+.. z.filters
+.. z[0] = np.array([1, 3, 5])
+.. z[1] = np.array([4])
+.. z[2] = np.array([7, 9, 14])
+.. z[:]
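+
+The :class:`numcodecs.vlen.VLenArray` codec can likewise be exercised at the
+codec level while the array examples above are pending; a sketch of a
+round-trip for ragged integer arrays:
+
+.. code-block:: python
+
+    import numcodecs
+    import numpy as np
+
+    codec = numcodecs.VLenArray('<i8')
+    ragged = np.empty(3, dtype=object)
+    ragged[0] = np.array([1, 3, 5])
+    ragged[1] = np.array([4])
+    ragged[2] = np.array([7, 9, 14])
+
+    out = codec.decode(codec.encode(ragged))  # object array of int64 arrays
+    assert [list(v) for v in out] == [[1, 3, 5], [4], [7, 9, 14]]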
+
+.. _tutorial_chunks:
+
+Chunk optimizations
+-------------------
+
+.. _tutorial_chunks_shape:
+
+Chunk size and shape
+~~~~~~~~~~~~~~~~~~~~
+
+In general, chunks of at least 1 megabyte (1M) uncompressed size seem to provide
+better performance, at least when using the Blosc compression library.
+
+The optimal chunk shape will depend on how you want to access the data. E.g.,
+for a 2-dimensional array, if you only ever take slices along the first
+dimension, then chunk across the second dimension. If you know you want to chunk
+across an entire dimension you can use ``None`` or ``-1`` within the ``chunks``
+argument, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z1 = zarr.zeros((10000, 10000), chunks=(100, None), dtype='i4')
+.. z1.chunks
+
+Alternatively, if you only ever take slices along the second dimension, then
+chunk across the first dimension, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z2 = zarr.zeros((10000, 10000), chunks=(None, 100), dtype='i4')
+.. z2.chunks
+
+If you require reasonable performance for both access patterns then you need to
+find a compromise, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z3 = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4')
+.. z3.chunks
+
+If you are feeling lazy, you can let Zarr guess a chunk shape for your data by
+providing ``chunks=True``, although please note that the algorithm for guessing
+a chunk shape is based on simple heuristics and may be far from optimal. E.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z4 = zarr.zeros((10000, 10000), chunks=True, dtype='i4')
+.. z4.chunks
+
+If you know you are always going to be loading the entire array into memory, you
+can turn off chunks by providing ``chunks=False``, in which case there will be
+one single chunk for the array:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z5 = zarr.zeros((10000, 10000), chunks=False, dtype='i4')
+.. z5.chunks
+
+.. _tutorial_chunks_order:
+
+Chunk memory layout
+~~~~~~~~~~~~~~~~~~~
+
+The order of bytes **within each chunk** of an array can be changed via the
+``order`` keyword argument, to use either C or Fortran layout. For
+multi-dimensional arrays, these two layouts may provide different compression
+ratios, depending on the correlation structure within the data. E.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. a = np.arange(100000000, dtype='i4').reshape(10000, 10000).T
+.. c = zarr.array(a, chunks=(1000, 1000))
+.. c.info
+.. f = zarr.array(a, chunks=(1000, 1000), order='F')
+.. f.info
+
+In the above example, Fortran order gives a better compression ratio. This is an
+artificial example but illustrates the general point that changing the order of
+bytes within chunks of an array may improve the compression ratio, depending on
+the structure of the data, the compression algorithm used, and which compression
+filters (e.g., byte-shuffle) have been applied.
+
+.. _tutorial_chunks_empty_chunks:
+
+Empty chunks
+~~~~~~~~~~~~
+
+As of version 2.11, it is possible to configure how Zarr handles the storage of
+chunks that are "empty" (i.e., every element in the chunk is equal to the array's fill value).
+When creating an array with ``write_empty_chunks=False``,
+Zarr will check whether a chunk is empty before compression and storage. If a chunk is empty,
+then Zarr does not store it, and instead deletes the chunk from storage
+if the chunk had been previously stored.
+
+This optimization prevents storing redundant objects and can speed up reads, but the cost is
+added computation during array writes, since the contents of
+each chunk must be compared to the fill value. These advantages are therefore contingent
+on the content of the array.
+If you know that your data will form chunks that are almost always non-empty, then there is
+no advantage to the optimization described above.
+In this case, creating an array with ``write_empty_chunks=True`` (the default) will instruct
+Zarr to write every chunk without checking for emptiness.
+
+The following example illustrates the effect of the ``write_empty_chunks`` flag on
+the time required to write an array with different values:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. import zarr
+.. import numpy as np
+.. import time
+.. from tempfile import TemporaryDirectory
+
+
+.. def timed_write(write_empty_chunks):
+..     """
+..     Measure the time required and number of objects created when writing
+..     to a Zarr array with random ints or fill value.
+..     """
+..     chunks = (8192,)
+..     shape = (chunks[0] * 1024,)
+..     data = np.random.randint(0, 255, shape)
+..     dtype = 'uint8'
+
+..     with TemporaryDirectory() as store:
+..         arr = zarr.open(store,
+..                         shape=shape,
+..                         chunks=chunks,
+..                         dtype=dtype,
+..                         write_empty_chunks=write_empty_chunks,
+..                         fill_value=0,
+..                         mode='w')
+..         # initialize all chunks
+..         arr[:] = 100
+..         result = []
+..         for value in (data, arr.fill_value):
+..             start = time.time()
+..             arr[:] = value
+..             elapsed = time.time() - start
+..             result.append((elapsed, arr.nchunks_initialized))
+
+..         return result
+
+.. for write_empty_chunks in (True, False):
+..     full, empty = timed_write(write_empty_chunks)
+..     print(f'\nwrite_empty_chunks={write_empty_chunks}:\n\tRandom Data: {full[0]:.4f}s, {full[1]} objects stored\n\t Empty Data: {empty[0]:.4f}s, {empty[1]} objects stored\n')
+
+In this example, writing random data is slightly slower with ``write_empty_chunks=True``,
+but writing empty data is substantially faster and generates far fewer objects in storage.
+
+.. _tutorial_rechunking:
+
+Changing chunk shapes (rechunking)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sometimes you are not free to choose the initial chunking of your input data, or
+you might have data saved with chunking which is not optimal for the analysis you
+have planned. In such cases it can be advantageous to re-chunk the data. For small
+datasets, or when the mismatch between input and output chunks is small
+such that only a few chunks of the input dataset need to be read to create each
+chunk in the output array, it is sufficient to simply copy the data to a new array
+with the desired chunking, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. a = zarr.zeros((10000, 10000), chunks=(100, 100), dtype='uint16', store='a.zarr')
+.. b = zarr.array(a, chunks=(100, 200), store='b.zarr')
+
+If the chunk shapes mismatch, however, a simple copy can lead to non-optimal data
+access patterns and incur a substantial performance hit when using
+file-based stores. One of the most pathological examples is
+switching from column-based chunking to row-based chunking, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. a = zarr.zeros((10000, 10000), chunks=(10000, 1), dtype='uint16', store='a.zarr')
+.. b = zarr.array(a, chunks=(1, 10000), store='b.zarr')
+
+which will require every chunk in the input data set to be repeatedly read when creating
+each output chunk. If the entire array will fit within memory, this is simply resolved
+by forcing the entire input array into memory as a numpy array before converting
+back to zarr with the desired chunking.
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. a = zarr.zeros((10000, 10000), chunks=(10000, 1), dtype='uint16', store='a.zarr')
+.. b = a[...]
+.. c = zarr.array(b, chunks=(1, 10000), store='c.zarr')
+
+For data sets which have mismatched chunks and which do not fit in memory, a
+more sophisticated approach to rechunking, such as offered by the
+`rechunker <https://github.com/pangeo-data/rechunker>`_ package and discussed
+`here <https://medium.com/pangeo/rechunker-the-missing-link-for-chunked-array-analytics-5b2359e9dc11>`_
+may offer a substantial improvement in performance.
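+
+A scaled-down, in-memory version of the load-then-rewrite pattern above
+(a sketch using the v2-style creation calls assumed throughout this page, so it
+may need adjusting as the API is ported):
+
+.. code-block:: python
+
+    import zarr
+
+    # column-chunked source: pathological to copy chunk-by-chunk into row chunks
+    src = zarr.zeros((1000, 1000), chunks=(1000, 1), dtype='uint16')
+
+    tmp = src[...]  # force the whole array into memory (only if it fits!)
+    dst = zarr.array(tmp, chunks=(1, 1000))  # rewrite with row-based chunking
+    assert dst.chunks == (1, 1000)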
+
+.. _tutorial_sync:
+
+Parallel computing and synchronization
+--------------------------------------
+
+Zarr arrays have been designed for use as the source or sink for data in
+parallel computations. By data source we mean that multiple concurrent read
+operations may occur. By data sink we mean that multiple concurrent write
+operations may occur, with each writer updating a different region of the
+array. Zarr arrays have **not** been designed for situations where multiple
+readers and writers are concurrently operating on the same array.
+
+Both multi-threaded and multi-process parallelism are possible. The bottleneck
+for most storage and retrieval operations is compression/decompression, and the
+Python global interpreter lock (GIL) is released wherever possible during these
+operations, so Zarr will generally not block other Python threads from running.
+
+When using a Zarr array as a data sink, some synchronization (locking) may be
+required to avoid data loss, depending on how data are being updated. If each
+worker in a parallel computation is writing to a separate region of the array,
+and if region boundaries are perfectly aligned with chunk boundaries, then no
+synchronization is required. However, if region and chunk boundaries are not
+perfectly aligned, then synchronization is required to avoid two workers
+attempting to modify the same chunk at the same time, which could result in data
+loss.
+
+To give a simple example, consider a 1-dimensional array of length 60, ``z``,
+divided into three chunks of 20 elements each. If three workers are running and
+each attempts to write to a 20 element region (i.e., ``z[0:20]``, ``z[20:40]``
+and ``z[40:60]``) then each worker will be writing to a separate chunk and no
+synchronization is required. However, if two workers are running and each
+attempts to write to a 30 element region (i.e., ``z[0:30]`` and ``z[30:60]``)
+then it is possible both workers will attempt to modify the middle chunk at the
+same time, and synchronization is required to prevent data loss.
+
+Zarr provides support for chunk-level synchronization. E.g., create an array
+with thread synchronization:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z = zarr.zeros((10000, 10000), chunks=(1000, 1000), dtype='i4', synchronizer=zarr.ThreadSynchronizer())
+.. z
+
+This array is safe to read or write within a multi-threaded program.
+
+Zarr also provides support for process synchronization via file locking,
+provided that all processes have access to a shared file system, and provided
+that the underlying file system supports file locking (which is not the case for
+some networked file systems). E.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. synchronizer = zarr.ProcessSynchronizer('data/example.sync')
+
+.. z = zarr.open_array('data/example', mode='w', shape=(10000, 10000), chunks=(1000, 1000), dtype='i4', synchronizer=synchronizer)
+.. z
+
+This array is safe to read or write from multiple processes.
+
+When using multiple processes to parallelize reads or writes on arrays using the Blosc
+compression library, it may be necessary to set ``numcodecs.blosc.use_threads = False``,
+as otherwise Blosc may share incorrect global state amongst processes causing programs
+to hang. See also the section on :ref:`tutorial_tips_blosc` below.
+
+Please note that support for parallel computing is an area of ongoing research
+and development. If you are using Zarr for parallel computing, we welcome
+feedback, experience, discussion, ideas and advice, particularly about issues
+related to data integrity and performance.
+
+.. _tutorial_pickle:
+
+Pickle support
+--------------
+
+Zarr arrays and groups can be pickled, as long as the underlying store object can be
+pickled. Instances of any of the storage classes provided in the :mod:`zarr.storage`
+module can be pickled, as can the built-in ``dict`` class which can also be used for
+storage.
+
+Note that if an array or group is backed by an in-memory store like a ``dict`` or
+:class:`zarr.storage.MemoryStore`, then when it is pickled all of the store data will be
+included in the pickled data. However, if an array or group is backed by a persistent
+store like a :class:`zarr.storage.DirectoryStore`, :class:`zarr.storage.ZipStore` or
+:class:`zarr.storage.DBMStore` then the store data **are not** pickled. The only thing
+that is pickled is the necessary parameters to allow the store to re-open any
+underlying files or databases upon being unpickled.
+
+E.g., pickle/unpickle an in-memory array:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. import pickle
+
+.. z1 = zarr.array(np.arange(100000))
+.. s = pickle.dumps(z1)
+.. len(s) > 5000  # relatively large because data have been pickled
+.. z2 = pickle.loads(s)
+.. z1 == z2
+.. np.all(z1[:] == z2[:])
+
+E.g., pickle/unpickle an array stored on disk:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z3 = zarr.open('data/walnuts.zarr', mode='w', shape=100000, dtype='i8')
+.. z3[:] = np.arange(100000)
+.. s = pickle.dumps(z3)
+.. len(s) < 200  # small because no data have been pickled
+.. z4 = pickle.loads(s)
+.. z3 == z4
+.. np.all(z3[:] == z4[:])
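+
+A compact version of the in-memory example, handy as a smoke test while this
+page is ported (a sketch; it assumes the in-memory store remains picklable, as
+described above):
+
+.. code-block:: python
+
+    import pickle
+
+    import numpy as np
+    import zarr
+
+    z1 = zarr.array(np.arange(1000))
+    z2 = pickle.loads(pickle.dumps(z1))  # store data travels with the pickle
+    assert np.all(z1[:] == z2[:])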
+
+.. _tutorial_datetime:
+
+Datetimes and timedeltas
+------------------------
+
+NumPy's ``datetime64`` ('M8') and ``timedelta64`` ('m8') dtypes are supported for Zarr
+arrays, as long as the units are specified. E.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z = zarr.array(['2007-07-13', '2006-01-13', '2010-08-13'], dtype='M8[D]')
+.. z
+.. z[:]
+.. z[0]
+.. z[0] = '1999-12-31'
+.. z[:]
+
+.. _tutorial_tips:
+
+Usage tips
+----------
+
+.. _tutorial_tips_copy:
+
+Copying large arrays
+~~~~~~~~~~~~~~~~~~~~
+
+Data can be copied between large arrays without needing much memory, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. z1 = zarr.empty((10000, 10000), chunks=(1000, 1000), dtype='i4')
+.. z1[:] = 42
+.. z2 = zarr.empty_like(z1)
+.. z2[:] = z1
+
+Internally the example above works chunk-by-chunk, extracting only the data from
+``z1`` required to fill each chunk in ``z2``. The source of the data (``z1``)
+could equally be an h5py Dataset.
+
+.. _tutorial_tips_blosc:
+
+Configuring Blosc
+~~~~~~~~~~~~~~~~~
+
+The Blosc compressor is able to use multiple threads internally to accelerate
+compression and decompression. By default, Blosc uses up to 8
+internal threads. The number of Blosc threads can be changed to increase or
+decrease this number, e.g.:
+
+.. .. ipython:: python
+..    :verbatim:
+
+.. from numcodecs import blosc
+
+.. blosc.set_nthreads(2)  # doctest: +SKIP
+
+When a Zarr array is being used within a multi-threaded program, Zarr
+automatically switches to using Blosc in a single-threaded
+"contextual" mode. This is generally better as it allows multiple
+program threads to use Blosc simultaneously and prevents CPU thrashing
+from too many active threads. If you want to manually override this
+behaviour, set the value of the ``blosc.use_threads`` variable to
+``True`` (Blosc always uses multiple internal threads) or ``False``
+(Blosc always runs in single-threaded contextual mode). To re-enable
+automatic switching, set ``blosc.use_threads`` to ``None``.
+
+Please note that if Zarr is being used within a multi-process program, Blosc may not
+be safe to use in multi-threaded mode and may cause the program to hang. If using Blosc
+in a multi-process program then it is recommended to set ``blosc.use_threads = False``.
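+
+The thread-related settings discussed above are all module-level switches in
+:mod:`numcodecs.blosc`; a short sketch collecting them in one place:
+
+.. code-block:: python
+
+    from numcodecs import blosc
+
+    blosc.set_nthreads(4)      # cap Blosc's internal thread pool
+
+    blosc.use_threads = False  # always single-threaded ("contextual") mode
+    blosc.use_threads = True   # always multi-threaded mode
+    blosc.use_threads = None   # restore automatic switching (the default)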