Skip to content

Commit

Permalink
Final tweaks, bug fixes, and performance checks for this stage of Uproot-writing (#428)

Browse files Browse the repository at this point in the history

* The next version will be 4.1.1.

* Fixed bug in writing slices from a single jagged array.

* Counter branch should be unsigned int32 (convention).

* Only write the parts of the basket-lookup arrays that change.

* Rule of thumb in the docs: 100 kB per branch per 'extend'.
  • Loading branch information
jpivarski authored Aug 30, 2021
1 parent: 42d078b — commit: 5ad18cf
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 26 deletions.
6 changes: 2 additions & 4 deletions docs-sphinx/basic.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1242,11 +1242,9 @@ The :ref:`uproot.writing.writable.WritableTree.extend` method always adds one TB
The arrays also have to have the same lengths as each other, though only in the first dimension. Above, the ``"x"`` NumPy array has shape ``(5, 3)``: the first dimension has length 5. The ``"y"`` Awkward array has type ``5 * var * float64``: the first dimension has length 5. This is why they are compatible; the inner dimensions don't matter (except inasmuch as they have the right *type*).
**As a word of warning,** be sure to make these extensions as large as is feasible within memory constraints. A ROOT file full of small TBaskets is both bloated (larger than it needs to be) and slow to read (especially for Uproot, but also for ROOT).
.. warning::
For instance, if you want to write a million events and have enough memory available to do that 100 thousand events at a time (i.e. a total of 10 TBaskets), then do so. Filling the TTree a hundred events at a time (i.e. a total of 10000 TBaskets) would be considerably slower for writing and reading, and the file would be much larger than it could otherwise be, even with compression.
The absolute worst case is one-entry-per-:ref:`uproot.writing.writable.WritableTree.extend`. You have been warned!
**As a word of warning,** be sure that each call to :ref:`uproot.writing.writable.WritableTree.extend` includes at least 100 kB per branch/array. (NumPy and Awkward Arrays have an `nbytes <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.nbytes.html>`__ property; you want at least ``100000`` per array.) If you ask Uproot to write very small TBaskets, such as the examples with length ``5`` above, it will spend more time working on TBasket overhead than actually writing data. The absolute worst case is one-entry-per-:ref:`uproot.writing.writable.WritableTree.extend`. See `#428 (comment) <https://github.com/scikit-hep/uproot4/pull/428#issuecomment-908703486>`__.
Specifying the compression
--------------------------
Expand Down
2 changes: 1 addition & 1 deletion src/uproot/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import re

__version__ = "4.1.0"
__version__ = "4.1.1"
version = __version__
version_info = tuple(re.split(r"[-\.]", __version__))

Expand Down
53 changes: 41 additions & 12 deletions src/uproot/writing/_cascadetree.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,8 @@ def _branch_np(
"fBasketSeek": numpy.zeros(
self._basket_capacity, uproot.models.TBranch._tbranch13_dtype3
),
"arrays_write_start": 0,
"arrays_write_stop": 0,
"metadata_start": None,
"basket_metadata_start": None,
"tleaf_reference_number": None,
Expand Down Expand Up @@ -507,7 +509,11 @@ def extend(self, file, sink, data):

if isinstance(data, awkward.Array):
if data.ndim > 1 and not data.layout.purelist_isregular:
provided = {self._counter_name(""): awkward.num(data, axis=1)}
provided = {
self._counter_name(""): numpy.asarray(
awkward.num(data, axis=1), dtype=">u4"
)
}
else:
provided = {}
for k, v in zip(awkward.fields(data), awkward.unzip(data)):
Expand Down Expand Up @@ -535,7 +541,9 @@ def extend(self, file, sink, data):
and v.ndim > 1
and not v.layout.purelist_isregular
):
provided[self._counter_name(k)] = awkward.num(v, axis=1)
provided[self._counter_name(k)] = numpy.asarray(
awkward.num(v, axis=1), dtype=">u4"
)

provided[k] = v

Expand Down Expand Up @@ -668,9 +676,10 @@ def extend(self, file, sink, data):

content = layout.content
offsets = numpy.asarray(layout.offsets)

if offsets[0] != 0:
content = content[offsets[0] :]
offsets -= offsets[0]
offsets = offsets - offsets[0]
if len(content) > offsets[-1]:
content = content[: offsets[-1]]

Expand Down Expand Up @@ -752,6 +761,8 @@ def extend(self, file, sink, data):

datum["fBasketSeek"][self._num_baskets] = location

datum["arrays_write_stop"] = self._num_baskets + 1

# update TTree metadata in file
self._num_entries += num_entries
self._num_baskets += 1
Expand Down Expand Up @@ -1139,7 +1150,6 @@ def write_updates(self, sink):
self._metadata["fEstimate"],
),
)
sink.flush()

for datum in self._branch_data:
if datum["kind"] == "record":
Expand Down Expand Up @@ -1177,16 +1187,33 @@ def write_updates(self, sink):
),
)

fBasketBytes = uproot._util.tobytes(datum["fBasketBytes"])
fBasketEntry = uproot._util.tobytes(datum["fBasketEntry"])
fBasketSeek = uproot._util.tobytes(datum["fBasketSeek"])
start, stop = datum["arrays_write_start"], datum["arrays_write_stop"]

fBasketBytes_part = uproot._util.tobytes(datum["fBasketBytes"][start:stop])
fBasketEntry_part = uproot._util.tobytes(
datum["fBasketEntry"][start : stop + 1]
)
fBasketSeek_part = uproot._util.tobytes(datum["fBasketSeek"][start:stop])

position = base + datum["basket_metadata_start"] + 1
sink.write(position, fBasketBytes)
position += len(fBasketBytes) + 1
sink.write(position, fBasketEntry)
position += len(fBasketEntry) + 1
sink.write(position, fBasketSeek)
position += datum["fBasketBytes"][:start].nbytes
sink.write(position, fBasketBytes_part)
position += len(fBasketBytes_part)
position += datum["fBasketBytes"][stop:].nbytes

position += 1
position += datum["fBasketEntry"][:start].nbytes
sink.write(position, fBasketEntry_part)
position += len(fBasketEntry_part)
position += datum["fBasketEntry"][stop + 1 :].nbytes

position += 1
position += datum["fBasketSeek"][:start].nbytes
sink.write(position, fBasketSeek_part)
position += len(fBasketSeek_part)
position += datum["fBasketSeek"][stop:].nbytes

datum["arrays_write_start"] = datum["arrays_write_stop"]

if datum["kind"] == "counter":
position = (
Expand All @@ -1203,6 +1230,8 @@ def write_updates(self, sink):
),
)

sink.flush()

def write_np_basket(self, sink, branch_name, compression, array):
fClassName = uproot.serialization.string("TBasket")
fName = uproot.serialization.string(branch_name)
Expand Down
12 changes: 3 additions & 9 deletions src/uproot/writing/writable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1716,15 +1716,9 @@ def extend(self, data):
my_directory["tree6"].extend({"branch1": another_numpy_array,
"branch2": another_awkward_array})
Be sure to make these extensions as large as is feasible within memory constraints,
because a ROOT file full of small TBaskets is bloated (larger than it needs to be)
and slow to read (especially for Uproot, but also for ROOT).
For instance, if you want to write a million events and have enough memory
available to do that 100 thousand events at a time (total of 10 TBaskets),
then do so. Filling the TTree a hundred events at a time (total of 10000 TBaskets)
would be considerably slower for writing and reading, and the file would be much
larger than it could otherwise be, even with compression.
.. warning::
**As a word of warning,** be sure that each call to :ref:`uproot.writing.writable.WritableTree.extend` includes at least 100 kB per branch/array. (NumPy and Awkward Arrays have an `nbytes <https://numpy.org/doc/stable/reference/generated/numpy.ndarray.nbytes.html>`__ property; you want at least ``100000`` per array.) If you ask Uproot to write very small TBaskets, it will spend more time working on TBasket overhead than actually writing data. The absolute worst case is one-entry-per-:ref:`uproot.writing.writable.WritableTree.extend`. See `#428 (comment) <https://github.com/scikit-hep/uproot4/pull/428#issuecomment-908703486>`__.
"""
self._cascading.extend(self._file, self._file.sink, data)

Expand Down

0 comments on commit 5ad18cf

Please sign in to comment.