From 4ca002e782a5688948a5e6a0c6fc7d52d0bdb2da Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Wed, 10 Jul 2024 10:21:55 +0200 Subject: [PATCH 1/8] add histogram LGDO --- pyproject.toml | 1 + src/lgdo/__init__.py | 4 + src/lgdo/lh5/_serializers/__init__.py | 2 + src/lgdo/lh5/_serializers/read/composite.py | 42 +++ src/lgdo/lh5/_serializers/write/composite.py | 9 +- src/lgdo/lh5/datatype.py | 1 + src/lgdo/lh5_store.py | 1 + src/lgdo/types/__init__.py | 3 + src/lgdo/types/histogram.py | 259 +++++++++++++++++++ tests/lh5/test_lh5_datatype.py | 1 + tests/lh5/test_lh5_write.py | 63 +++++ tests/types/test_histogram.py | 138 ++++++++++ 12 files changed, 523 insertions(+), 1 deletion(-) create mode 100644 src/lgdo/types/histogram.py create mode 100644 tests/types/test_histogram.py diff --git a/pyproject.toml b/pyproject.toml index 70c84976..03efa6e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "colorlog", "h5py>=3.2", "hdf5plugin", + "hist", "numba!=0.53.*,!=0.54.*", "numexpr", "numpy>=1.21", diff --git a/src/lgdo/__init__.py b/src/lgdo/__init__.py index 5dd46a7b..cb7c30d7 100644 --- a/src/lgdo/__init__.py +++ b/src/lgdo/__init__.py @@ -50,6 +50,8 @@ ArrayOfEncodedEqualSizedArrays, ArrayOfEqualSizedArrays, FixedSizeArray, + Histogram, + HistogramAxis, Scalar, Struct, Table, @@ -63,6 +65,8 @@ "ArrayOfEqualSizedArrays", "ArrayOfEncodedEqualSizedArrays", "FixedSizeArray", + "Histogram", + "HistogramAxis", "LGDO", "Scalar", "Struct", diff --git a/src/lgdo/lh5/_serializers/__init__.py b/src/lgdo/lh5/_serializers/__init__.py index cf020e40..a4a8d6c0 100644 --- a/src/lgdo/lh5/_serializers/__init__.py +++ b/src/lgdo/lh5/_serializers/__init__.py @@ -7,6 +7,7 @@ _h5_read_ndarray, ) from .read.composite import ( + _h5_read_histogram, _h5_read_lgdo, _h5_read_struct, _h5_read_table, @@ -32,6 +33,7 @@ "_h5_read_array_of_equalsized_arrays", "_h5_read_struct", "_h5_read_table", + "_h5_read_histogram", "_h5_read_scalar", "_h5_read_array_of_encoded_equalsized_arrays", "_h5_read_vector_of_encoded_vectors", diff --git a/src/lgdo/lh5/_serializers/read/composite.py b/src/lgdo/lh5/_serializers/read/composite.py index 6c0e9ba3..16697714 100644 --- a/src/lgdo/lh5/_serializers/read/composite.py +++ b/src/lgdo/lh5/_serializers/read/composite.py @@ -13,6 +13,7 @@ ArrayOfEncodedEqualSizedArrays, ArrayOfEqualSizedArrays, FixedSizeArray, + Histogram, Scalar, Struct, Table, @@ -168,6 +169,17 @@ def _h5_read_lgdo( decompress=decompress, ) + if lgdotype is Histogram: + return _h5_read_histogram( + h5o, + start_row=start_row, + n_rows=n_rows, + idx=idx, + use_h5idx=use_h5idx, + field_mask=field_mask, + decompress=decompress, + ) + if lgdotype is ArrayOfEncodedEqualSizedArrays: return _h5_read_array_of_encoded_equalsized_arrays( h5o, @@ -385,3 +397,33 @@ def _h5_read_table( utils.check_obj_buf_attrs(obj_buf.attrs, attrs, h5g) return obj_buf, n_rows_read + + +def _h5_read_histogram( + h5g, + start_row=0, + n_rows=sys.maxsize, + idx=None, + use_h5idx=False, + field_mask=None, + decompress=True, +): + struct, n_rows_read = _h5_read_struct( + h5g, + start_row, + n_rows, + idx, + use_h5idx, + field_mask, + decompress, + ) + isdensity = struct.isdensity.value + binning = [ + (a.binedges.first, a.binedges.last, a.binedges.step, a.closedleft) + for _, a in struct.binning.items() + ] + binning = [tuple(v.value for v in b) for b in binning] + weights = struct.weights.view_as("np") + histogram = Histogram(weights, binning, isdensity) + + return histogram, n_rows_read diff --git a/src/lgdo/lh5/_serializers/write/composite.py b/src/lgdo/lh5/_serializers/write/composite.py index afe5e5ae..8b9bbf7e 100644 --- a/src/lgdo/lh5/_serializers/write/composite.py +++ b/src/lgdo/lh5/_serializers/write/composite.py @@ -63,8 +63,15 @@ def _h5_write_lgdo( msg = f"can't overwrite '{name}' in wo_mode 'write_safe'" raise LH5EncodeError(msg, lh5_file, group, name) - # struct or table or waveform table + # struct, table, waveform table or histogram. if isinstance(obj, types.Struct): + if isinstance(obj, types.Histogram) and wo_mode not in ["w", "o", "of"]: + msg = f"can't append-write histogram in wo_mode '{wo_mode}'" + raise LH5EncodeError(msg, lh5_file, group, name) + if isinstance(obj, types.Histogram) and write_start != 0: + msg = f"can't write histogram in wo_mode '{wo_mode}' with write_start != 0" + raise LH5EncodeError(msg, lh5_file, group, name) + return _h5_write_struct( obj, name, diff --git a/src/lgdo/lh5/datatype.py b/src/lgdo/lh5/datatype.py index c785d814..ed998388 100644 --- a/src/lgdo/lh5/datatype.py +++ b/src/lgdo/lh5/datatype.py @@ -14,6 +14,7 @@ lgdo.ArrayOfEncodedEqualSizedArrays, r"^array_of_encoded_equalsized_arrays<1,1>\{.+\}$", ), + (lgdo.Histogram, r"^struct\{binning,weights,isdensity\}$"), (lgdo.Struct, r"^struct\{.*\}$"), (lgdo.Table, r"^table\{.*\}$"), (lgdo.FixedSizeArray, r"^fixedsize_array<\d+>\{.+\}$"), diff --git a/src/lgdo/lh5_store.py b/src/lgdo/lh5_store.py index dddfc15c..dc8d1d95 100644 --- a/src/lgdo/lh5_store.py +++ b/src/lgdo/lh5_store.py @@ -20,6 +20,7 @@ ArrayOfEncodedEqualSizedArrays, # noqa: F401 ArrayOfEqualSizedArrays, # noqa: F401 FixedSizeArray, # noqa: F401 + Histogram, # noqa: F401 Scalar, Struct, Table, # noqa: F401 diff --git a/src/lgdo/types/__init__.py b/src/lgdo/types/__init__.py index f86a8c8f..fa92c2ed 100644 --- a/src/lgdo/types/__init__.py +++ b/src/lgdo/types/__init__.py @@ -6,6 +6,7 @@ from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays from .encoded import ArrayOfEncodedEqualSizedArrays, VectorOfEncodedVectors from .fixedsizearray import FixedSizeArray +from .histogram import Histogram, HistogramAxis from .lgdo import LGDO from .scalar import Scalar from .struct import Struct @@ -18,6 +19,8 @@ "ArrayOfEqualSizedArrays", "ArrayOfEncodedEqualSizedArrays", "FixedSizeArray", + "Histogram", + "HistogramAxis", "LGDO", "Scalar", "Struct", diff --git a/src/lgdo/types/histogram.py b/src/lgdo/types/histogram.py new file mode 100644 index 00000000..410f9032 --- /dev/null +++ b/src/lgdo/types/histogram.py @@ -0,0 +1,259 @@ +from __future__ import annotations + +from typing import Any + +import hist +import numpy as np + +from .array import Array +from .lgdo import LGDO +from .scalar import Scalar +from .struct import Struct + + +class HistogramAxis(Struct): + def __init__( + self, first: float, last: float, step: float, closedleft: bool = True + ) -> None: + """ + A special struct to group axis parameters for use in a :class:`Histogram`. + + Parameters + ---------- + first + left edge of the leftmost bin + last + right edge of the rightmost bin + step + step size (width of each bin) + closedleft + if True, the bin intervals are left-closed :math:`[a,b)`; + if False, intervals are right-closed :math:`(a,b]`. + """ + super().__init__( + { + "binedges": Struct( + {"first": Scalar(first), "last": Scalar(last), "step": Scalar(step)} + ), + "closedleft": Scalar(closedleft), + }, + ) + + @classmethod + def from_edges(cls, edges: np.ndarray) -> HistogramAxis: + edge_diff = np.diff(edges) + if np.any(~np.isclose(edge_diff, edge_diff[0])): + msg = "only evenly distributed edges can be converted" + raise ValueError(msg) + return cls(edges[0], edges[-1], edge_diff[0], True) + + @property + def first(self) -> float: + return self["binedges"]["first"].value + + @property + def last(self) -> float: + return self["binedges"]["last"].value + + @property + def step(self) -> float: + return self["binedges"]["step"].value + + @property + def closedleft(self) -> bool: + return self["closedleft"].value + + @property + def nbins(self) -> int: + bins = (self.last - self.first) / self.step + bins_int = int(np.rint(bins)) + assert np.isclose(bins, bins_int) + return bins_int + + def __str__(self) -> str: + return ( + f"first={self.first}, last={self.last}, step={self.step}, " + + f"closedleft={self.closedleft}" + ) + + +class Histogram(Struct): + def __init__( + self, + weights: hist.Hist | np.ndarray, + binning: None + | list[HistogramAxis] + | list[np.ndarray] + | list[tuple[float, float, float, bool]] = None, + isdensity: bool = False, + attrs: dict[str, Any] | None = None, + ) -> None: + """A special struct to contain histogrammed data. + + Parameters + ---------- + weights + An :class:`numpy.ndarray` to be used for this object's internal + array. Note: the array is used directly, not copied; or a :class:`hist.Hist` + object, that will be copied and not used directly. + binning + * has to by None if a :class:`hist.Hist` has been passed as ``weights`` + * can be a list of pre-initialized :class:`HistogramAxes` + * can be a list of tuples, representing arguments to :meth:`HistogramAxes.__init__()` + * can be a list of numpy arrays, as returned by :func:`numpy.histogramdd`. + """ + if isinstance(weights, hist.Hist): + if binning is not None: + msg = "not allowed to pass custom binning if constructing from hist.Hist instance" + raise ValueError(msg) + if isdensity: + msg = "not allowed to pass isdensity=True if constructing from hist.Hist instance" + raise ValueError(msg) + + if weights.sum(flow=True) != weights.sum(flow=False): + msg = "flow bins of hist.Hist cannopt be represented" + raise ValueError(msg) + w = Array(weights.view(flow=False)) + + b = [] + for ax in weights.axes: + if not isinstance(ax, hist.axis.Regular): + msg = "only regular axes of hist.Hist can be converted" + raise ValueError(msg) + b.append(HistogramAxis.from_edges(ax.edges)) + b = self._create_binning(b) + else: + if binning is None: + msg = "need to also pass binning if passing histogram as array" + raise ValueError(msg) + w = Array(weights) + + if all(isinstance(ax, HistogramAxis) for ax in binning): + b = binning + elif all(isinstance(ax, np.ndarray) for ax in binning): + b = [HistogramAxis.from_edges(ax) for ax in binning] + elif all(isinstance(ax, tuple) for ax in binning): + b = [HistogramAxis(*ax) for ax in binning] + else: + msg = "invalid binning object passed" + raise ValueError(msg) + + if len(binning) != len(weights.shape): + msg = "binning and weight dimensions do not match" + raise ValueError(msg) + for i, ax in enumerate(b): + if ax.nbins != weights.shape[i]: + msg = f"bin count does not match weight count along axis {i}" + raise ValueError(msg) + b = self._create_binning(b) + + super().__init__( + {"binning": b, "weights": w, "isdensity": Scalar(isdensity)}, + attrs, + ) + + def _create_binning(self, axes: list[HistogramAxis]) -> Struct: + return Struct({f"axis_{i}": a for i, a in enumerate(axes)}) + + @property + def isdensity(self) -> bool: + return self["isdensity"].value + + @property + def weights(self) -> Array: + return self["weights"] + + @property + def binning(self) -> tuple[HistogramAxis, ...]: + bins = sorted(self["binning"].items()) + assert all(isinstance(v, HistogramAxis) for k, v in bins) + return tuple(v for _, v in bins) + + def __setitem__(self, name: str, obj: LGDO) -> None: + # do not allow for new attributes on this + msg = "histogram fields cannot be mutated" + raise TypeError(msg) + + def __getattr__(self, name: str) -> None: + # do not allow for new attributes on this + msg = "histogram fields cannot be mutated" + raise TypeError(msg) + + def add_field(self, name: str | int, obj: LGDO) -> None: # noqa: ARG002 + msg = "histogram fields cannot be mutated" + raise TypeError(msg) + + def remove_field(self, name: str | int, delete: bool = False) -> None: # noqa: ARG002 + msg = "histogram fields cannot be mutated" + raise TypeError(msg) + + def __str__(self) -> str: + string = "{\n" + for k, v in enumerate(self.binning): + string += f" 'axis_{k}': {v},\n" + string += "}" + + attrs = self.getattrs() + if attrs: + string += f" with attrs={attrs}" + + return string + + def view_as( + self, + library: str, + ) -> tuple[np.ndarray] | hist.Hist: + r"""View the histogram data as a third-party format data structure. + + This is typically a zero-copy or nearly zero-copy operation. + + Supported third-party formats are: + + - ``np``: returns a tuple of binning and an :class:`np.ndarray` + - ``hist``: returns an :class:`hist.Hist` + + Notes + ----- + For the return value of the ``np`` output type, see :func:`numpy.histogramdd`. + + Parameters + ---------- + library + format of the returned data view. + + See Also + -------- + .LGDO.view_as + """ + if library == "hist": + if self.isdensity: + msg = "hist.Hist cannot represent density histograms" + raise ValueError(msg) + + hist_axes = [] + for a in self.binning: + if not a.closedleft: + msg = "hist.Hist cannot represent right-closed intervals" + raise ValueError(msg) + hist_axes.append( + hist.axis.Regular( + bins=a.nbins, + start=a.first, + stop=a.last, + underflow=False, + overflow=False, + ) + ) + + hist_hist = hist.Hist(*hist_axes) + hist_hist[:] = self.weights.view_as("np") + return hist_hist + + if library == "np": + edges = [] + for a in self.binning: + edges.append(np.linspace(a.first, a.last, a.nbins)) + return self.weights.view_as("np"), tuple(edges) + + msg = f"{library!r} is not a supported third-party format." + raise TypeError(msg) diff --git a/tests/lh5/test_lh5_datatype.py b/tests/lh5/test_lh5_datatype.py index 36651324..21799221 100644 --- a/tests/lh5/test_lh5_datatype.py +++ b/tests/lh5/test_lh5_datatype.py @@ -33,6 +33,7 @@ def test_datatype2lgdo(): assert d("struct{a,b,c,d}") == types.Struct assert d("struct{}") == types.Struct assert d("table{a,b,c,d}") == types.Table + assert d("struct{binning,weights,isdensity}") == types.Histogram def test_utils(): diff --git a/tests/lh5/test_lh5_write.py b/tests/lh5/test_lh5_write.py index d0023770..e4152b29 100644 --- a/tests/lh5/test_lh5_write.py +++ b/tests/lh5/test_lh5_write.py @@ -391,3 +391,66 @@ def test_write_object_append_column(tmptestdir): assert isinstance(tb_dat, types.Table) assert np.array_equal(tb_dat["dset1"].nda, np.zeros(10)) assert np.array_equal(tb_dat["dset2"].nda, np.ones(10)) + + +# Test writing and reading histograms. +def test_write_histogram(caplog, tmptestdir): + caplog.set_level(logging.DEBUG) + caplog.clear() + + # Start with an types.Histogram + if os.path.exists(f"{tmptestdir}/write_histogram_test.lh5"): + os.remove(f"{tmptestdir}/write_histogram_test.lh5") + + h1 = types.Histogram( + np.array([[1, 1], [1, 1]]), (np.array([0, 1, 2]), np.array([2.1, 2.2, 2.3])) + ) + h2 = types.Histogram( + np.array([[10, 10], [10, 10]]), + (np.array([2, 3, 4]), np.array([5, 6, 7])), + isdensity=True, + ) # Same field name, different values + store = lh5.LH5Store() + store.write( + h1, + "my_histogram", + f"{tmptestdir}/write_histogram_test.lh5", + group="my_group", + wo_mode="write_safe", + ) + store.write( + h2, + "my_histogram", + f"{tmptestdir}/write_histogram_test.lh5", + wo_mode="overwrite", + group="my_group", + ) + + # Now, check that the data were overwritten + h3, _ = store.read( + "my_group/my_histogram", f"{tmptestdir}/write_histogram_test.lh5" + ) + assert np.array_equal(h3.weights.nda, np.array([[10, 10], [10, 10]])) + assert h3.binning[0].first == 2 + assert h3.binning[1].last == 7 + assert h3.isdensity + + # Now, check that writing with other modes throws. + for disallowed_wo_mode in ["append", "append_column"]: + with pytest.raises(lh5.exceptions.LH5EncodeError): + store.write( + h2, + "my_histogram", + f"{tmptestdir}/write_histogram_test.lh5", + wo_mode=disallowed_wo_mode, + group="my_group", + ) + with pytest.raises(lh5.exceptions.LH5EncodeError): + store.write( + h2, + "my_histogram", + f"{tmptestdir}/write_histogram_test.lh5", + wo_mode="overwrite", + write_start=1, + group="my_group", + ) diff --git a/tests/types/test_histogram.py b/tests/types/test_histogram.py new file mode 100644 index 00000000..87383578 --- /dev/null +++ b/tests/types/test_histogram.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +import hist +import numpy as np +import pytest + +from lgdo import Histogram, HistogramAxis, Scalar + + +def test_init_hist(): + h = hist.Hist( + hist.axis.Regular(bins=10, start=0, stop=1, name="x"), + hist.axis.Regular(bins=10, start=0, stop=1, name="y"), + ) + rng = np.random.default_rng() + h.fill(rng.uniform(size=500), rng.uniform(size=500)) + h = Histogram(h, None) + + assert len(h.binning) == 2 + + h = hist.Hist(hist.axis.Integer(start=0, stop=10)) + with pytest.raises(ValueError, match="only regular axes"): + h = Histogram(h) + + h = hist.Hist(hist.axis.Regular(bins=10, start=0, stop=1)) + with pytest.raises(ValueError, match="isdensity=True"): + Histogram(h, isdensity=True) + with pytest.raises(ValueError, match="custom binning"): + Histogram(h, binning=(np.array([0, 1, 2]),)) + + h.fill([-1, 0.8, 2]) + with pytest.raises(ValueError, match="flow bins"): + Histogram(h) + + +def test_init_np(): + h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) + assert len(h.binning) == 1 + assert not h.isdensity + + h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),), isdensity=True) + assert h.isdensity + + with pytest.raises(ValueError, match="also pass binning"): + h = Histogram(np.array([1, 1]), None) + + with pytest.raises(ValueError, match="dimensions do not match"): + h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]), np.array([0, 1, 2]))) + + with pytest.raises(ValueError, match="bin count does not match weight count"): + h = Histogram( + np.array([[1, 1], [1, 1]]), (np.array([0, 1, 2]), np.array([0, 1])) + ) + + with pytest.raises(ValueError, match="only evenly"): + h = Histogram(np.array([1, 1, 1]), (np.array([0, 1, 2.5, 3]),)) + + +def test_datatype_name(): + h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) + assert h.datatype_name() == "struct" + assert h.form_datatype() == "struct{binning,weights,isdensity}" + + +def test_axes(): + h = Histogram( + np.array([[1, 1], [1, 1]]), + (np.array([0, 1, 2]), np.array([2.1, 2.2, 2.3])), + attrs={"units": "m"}, + ) + assert len(h.binning) == 2 + str(h) + + ax0 = h.binning[0] + assert ax0.first == 0 + assert ax0.last == 2 + assert ax0.step == 1 + assert isinstance(ax0.nbins, int) + assert str(ax0) == "first=0, last=2, step=1, closedleft=True" + + ax1 = h.binning[1] + assert ax1.first == 2.1 + assert ax1.last == 2.3 + assert np.isclose(ax1.step, 0.1) + assert isinstance(ax0.nbins, int) + + h = Histogram(np.array([[1, 1], [1, 1]]), [(1, 3, 1, True), (4, 6, 1, False)]) + ax0 = h.binning[0] + str(h) + + h = Histogram(np.array([[1, 1], [1, 1]]), [(1, 3, 1, True), (4, 6, 1, False)]) + ax0 = h.binning[0] + str(h) + + with pytest.raises(ValueError, match="invalid binning object"): + h = Histogram(np.array([[1, 1], [1, 1]]), (range(2), range(2))) + + h = Histogram( + np.array([[1, 1], [1, 1]]), + [HistogramAxis(1, 3, 1, True), HistogramAxis(4, 6, 1, False)], + ) + + with pytest.raises(ValueError, match="invalid binning object"): + h = Histogram( + np.array([[1, 1], [1, 1]]), + [(1, 3, 1, True), HistogramAxis(4, 6, 1, False)], + ) + + +def test_view_as_hist(): + h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) + h.view_as("hist") + + h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),), isdensity=True) + with pytest.raises(ValueError, match="cannot represent density"): + h.view_as("hist") + + h = Histogram(np.array([[1, 1], [1, 1]]), [(1, 3, 1, True), (4, 6, 1, False)]) + with pytest.raises(ValueError, match="cannot represent right-closed"): + h.view_as("hist") + + +def test_view_as_np(): + h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) + h.view_as("np") + + +def test_not_like_table(): + h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) + assert h.form_datatype() == "struct{binning,weights,isdensity}" + with pytest.raises(TypeError): + x = h.x # noqa: F841 + with pytest.raises(TypeError): + h["x"] = Scalar(1.0) + with pytest.raises(TypeError): + h.add_field("x", Scalar(1.0)) + with pytest.raises(TypeError): + h.remove_field("binning") From 810e53627830d511ebba2e6f0f03bccea8f6e0fc Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Wed, 10 Jul 2024 16:32:40 +0200 Subject: [PATCH 2/8] fix table docs --- src/lgdo/types/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lgdo/types/table.py b/src/lgdo/types/table.py index 24726dfa..81c43bf3 100644 --- a/src/lgdo/types/table.py +++ b/src/lgdo/types/table.py @@ -168,7 +168,7 @@ def add_column(self, name: str, obj: LGDO, use_obj_size: bool = False) -> None: self.add_field(name, obj, use_obj_size=use_obj_size) def remove_column(self, name: str, delete: bool = False) -> None: - """Alias for :meth:`.remove_field` using table terminology 'column'.""" + """Alias for :meth:`Struct.remove_field` using table terminology 'column'.""" super().remove_field(name, delete) def join( From b4296d47db8b6758863dd4c07f2f13a785c34d5f Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Thu, 11 Jul 2024 14:12:06 +0200 Subject: [PATCH 3/8] implement discussed improvements --- src/lgdo/__init__.py | 2 - src/lgdo/types/__init__.py | 3 +- src/lgdo/types/histogram.py | 181 +++++++++++++++++++--------------- tests/types/test_histogram.py | 14 ++- 4 files changed, 111 insertions(+), 89 deletions(-) diff --git a/src/lgdo/__init__.py b/src/lgdo/__init__.py index cb7c30d7..47c721dd 100644 --- a/src/lgdo/__init__.py +++ b/src/lgdo/__init__.py @@ -51,7 +51,6 @@ ArrayOfEqualSizedArrays, FixedSizeArray, Histogram, - HistogramAxis, Scalar, Struct, Table, @@ -66,7 +65,6 @@ "ArrayOfEncodedEqualSizedArrays", "FixedSizeArray", "Histogram", - "HistogramAxis", "LGDO", "Scalar", "Struct", diff --git a/src/lgdo/types/__init__.py b/src/lgdo/types/__init__.py index fa92c2ed..0ba55013 100644 --- a/src/lgdo/types/__init__.py +++ b/src/lgdo/types/__init__.py @@ -6,7 +6,7 @@ from .arrayofequalsizedarrays import ArrayOfEqualSizedArrays from .encoded import ArrayOfEncodedEqualSizedArrays, VectorOfEncodedVectors from .fixedsizearray import FixedSizeArray -from .histogram import Histogram, HistogramAxis +from .histogram import Histogram from .lgdo import LGDO from .scalar import Scalar from .struct import Struct @@ -20,7 +20,6 @@ "ArrayOfEncodedEqualSizedArrays", "FixedSizeArray", "Histogram", - "HistogramAxis", "LGDO", "Scalar", "Struct", diff --git a/src/lgdo/types/histogram.py b/src/lgdo/types/histogram.py index 410f9032..14203ec6 100644 --- a/src/lgdo/types/histogram.py +++ b/src/lgdo/types/histogram.py @@ -11,78 +11,90 @@ from .struct import Struct -class HistogramAxis(Struct): - def __init__( - self, first: float, last: float, step: float, closedleft: bool = True - ) -> None: - """ - A special struct to group axis parameters for use in a :class:`Histogram`. - - Parameters - ---------- - first - left edge of the leftmost bin - last - right edge of the rightmost bin - step - step size (width of each bin) - closedleft - if True, the bin intervals are left-closed :math:`[a,b)`; - if False, intervals are right-closed :math:`(a,b]`. - """ - super().__init__( - { - "binedges": Struct( - {"first": Scalar(first), "last": Scalar(last), "step": Scalar(step)} - ), - "closedleft": Scalar(closedleft), - }, - ) - - @classmethod - def from_edges(cls, edges: np.ndarray) -> HistogramAxis: - edge_diff = np.diff(edges) - if np.any(~np.isclose(edge_diff, edge_diff[0])): - msg = "only evenly distributed edges can be converted" - raise ValueError(msg) - return cls(edges[0], edges[-1], edge_diff[0], True) - - @property - def first(self) -> float: - return self["binedges"]["first"].value +class Histogram(Struct): + class Axis(Struct): + def __init__( + self, + first: float, + last: float, + step: float, + closedleft: bool = True, + binedge_attrs: dict[str, Any] | None = None, + ) -> None: + """ + A special struct to group axis parameters for use in a :class:`Histogram`. + + Parameters + ---------- + first + left edge of the leftmost bin + last + right edge of the rightmost bin + step + step size (width of each bin) + closedleft + if True, the bin intervals are left-closed :math:`[a,b)`; + if False, intervals are right-closed :math:`(a,b]`. + binedge_attrs + attributes that will be added to the ``binedges`` LGDO that + is part of the axis struct. + """ + super().__init__( + { + "binedges": Struct( + { + "first": Scalar(first), + "last": Scalar(last), + "step": Scalar(step), + }, + binedge_attrs, + ), + "closedleft": Scalar(closedleft), + }, + ) + + @classmethod + def from_edges(cls, edges: np.ndarray) -> Histogram.Axis: + edge_diff = np.diff(edges) + if np.any(~np.isclose(edge_diff, edge_diff[0])): + msg = "only evenly distributed edges can be converted" + raise ValueError(msg) + return cls(edges[0], edges[-1], edge_diff[0], True) - @property - def last(self) -> float: - return self["binedges"]["last"].value + @property + def first(self) -> float: + return self["binedges"]["first"].value - @property - def step(self) -> float: - return self["binedges"]["step"].value + @property + def last(self) -> float: + return self["binedges"]["last"].value - @property - def closedleft(self) -> bool: - return self["closedleft"].value + @property + def step(self) -> float: + return self["binedges"]["step"].value - @property - def nbins(self) -> int: - bins = (self.last - self.first) / self.step - bins_int = int(np.rint(bins)) - assert np.isclose(bins, bins_int) - return bins_int + @property + def closedleft(self) -> bool: + return self["closedleft"].value - def __str__(self) -> str: - return ( - f"first={self.first}, last={self.last}, step={self.step}, " - + f"closedleft={self.closedleft}" - ) + @property + def nbins(self) -> int: + bins = (self.last - self.first) / self.step + bins_int = int(np.rint(bins)) + assert np.isclose(bins, bins_int) + return bins_int + def __str__(self) -> str: + return ( + f"first={self.first}, last={self.last}, step={self.step}, " + + f"closedleft={self.closedleft}" + ) -class Histogram(Struct): def __init__( self, weights: hist.Hist | np.ndarray, binning: None - | list[HistogramAxis] + | list[Histogram.Axis] | list[np.ndarray] | list[tuple[float, float, float, bool]] = None, isdensity: bool = False, @@ -94,12 +106,13 @@ def __init__( ---------- weights An :class:`numpy.ndarray` to be used for this object's internal - array. Note: the array is used directly, not copied; or a :class:`hist.Hist` - object, that will be copied and not used directly. + array, or a :class:`hist.Hist` object, whose data view is used + for this object's internal array. + Note: the array/histogram view is used directly, not copied binning * has to by None if a :class:`hist.Hist` has been passed as ``weights`` - * can be a list of pre-initialized :class:`HistogramAxes` - * can be a list of tuples, representing arguments to :meth:`HistogramAxes.__init__()` + * can be a list of pre-initialized :class:`Histogram.Axis` + * can be a list of tuples, representing arguments to :meth:`Histogram.Axis.__init__()` * can be a list of numpy arrays, as returned by :func:`numpy.histogramdd`. """ if isinstance(weights, hist.Hist): @@ -111,16 +124,20 @@ def __init__( raise ValueError(msg) if weights.sum(flow=True) != weights.sum(flow=False): - msg = "flow bins of hist.Hist cannopt be represented" + msg = "flow bins of hist.Hist cannot be represented" + raise ValueError(msg) + weights_view = weights.view(flow=False) + if not isinstance(weights_view, np.ndarray): + msg = "only numpy-backed storages can be used in a hist.Hist" raise ValueError(msg) - w = Array(weights.view(flow=False)) + w = Array(weights_view) b = [] for ax in weights.axes: if not isinstance(ax, hist.axis.Regular): msg = "only regular axes of hist.Hist can be converted" raise ValueError(msg) - b.append(HistogramAxis.from_edges(ax.edges)) + b.append(Histogram.Axis.from_edges(ax.edges)) b = self._create_binning(b) else: if binning is None: @@ -128,12 +145,12 @@ def __init__( raise ValueError(msg) w = Array(weights) - if all(isinstance(ax, HistogramAxis) for ax in binning): + if all(isinstance(ax, Histogram.Axis) for ax in binning): b = binning elif all(isinstance(ax, np.ndarray) for ax in binning): - b = [HistogramAxis.from_edges(ax) for ax in binning] + b = [Histogram.Axis.from_edges(ax) for ax in binning] elif all(isinstance(ax, tuple) for ax in binning): - b = [HistogramAxis(*ax) for ax in binning] + b = [Histogram.Axis(*ax) for ax in binning] else: msg = "invalid binning object passed" raise ValueError(msg) @@ -152,7 +169,7 @@ def __init__( attrs, ) - def _create_binning(self, axes: list[HistogramAxis]) -> Struct: + def _create_binning(self, axes: list[Histogram.Axis]) -> Struct: return Struct({f"axis_{i}": a for i, a in enumerate(axes)}) @property @@ -164,9 +181,9 @@ def weights(self) -> Array: return self["weights"] @property - def binning(self) -> tuple[HistogramAxis, ...]: + def binning(self) -> tuple[Histogram.Axis, ...]: bins = sorted(self["binning"].items()) - assert all(isinstance(v, HistogramAxis) for k, v in bins) + assert all(isinstance(v, Histogram.Axis) for k, v in bins) return tuple(v for _, v in bins) def __setitem__(self, name: str, obj: LGDO) -> None: @@ -209,12 +226,14 @@ def view_as( Supported third-party formats are: - - ``np``: returns a tuple of binning and an :class:`np.ndarray` - - ``hist``: returns an :class:`hist.Hist` + - ``np``: returns a tuple of binning and an :class:`np.ndarray`, similar + to the return value of :func:`numpy.histogramdd`. + - ``hist``: returns an :class:`hist.Hist` that holds **a copy** of this + histogram's data. - Notes - ----- - For the return value of the ``np`` output type, see :func:`numpy.histogramdd`. + Warning + ------- + Viewing as ``hist`` will perform a copy of the store histogram data. Parameters ---------- @@ -245,9 +264,7 @@ def view_as( ) ) - hist_hist = hist.Hist(*hist_axes) - hist_hist[:] = self.weights.view_as("np") - return hist_hist + return hist.Hist(*hist_axes, data=self.weights.view_as("np")) if library == "np": edges = [] diff --git a/tests/types/test_histogram.py b/tests/types/test_histogram.py index 87383578..5637dc20 100644 --- a/tests/types/test_histogram.py +++ b/tests/types/test_histogram.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from lgdo import Histogram, HistogramAxis, Scalar +from lgdo import Histogram, Scalar def test_init_hist(): @@ -32,6 +32,14 @@ def test_init_hist(): with pytest.raises(ValueError, match="flow bins"): Histogram(h) + # assert that the hist data is not copied into the LGDO. + h = hist.Hist(hist.axis.Regular(bins=10, start=0, stop=10)) + h.fill([1, 2, 3]) + hi = Histogram(h) + assert np.sum(hi.weights.nda) == 3 + h.fill([1, 2, 3]) + assert np.sum(hi.weights.nda) == 6 + def test_init_np(): h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) @@ -97,13 +105,13 @@ def test_axes(): h = Histogram( np.array([[1, 1], [1, 1]]), - [HistogramAxis(1, 3, 1, True), HistogramAxis(4, 6, 1, False)], + [Histogram.Axis(1, 3, 1, True), Histogram.Axis(4, 6, 1, False)], ) with pytest.raises(ValueError, match="invalid binning object"): h = Histogram( np.array([[1, 1], [1, 1]]), - [(1, 3, 1, True), HistogramAxis(4, 6, 1, False)], + [(1, 3, 1, True), Histogram.Axis(4, 6, 1, False)], ) From 79449e64ba4148bebfa1fc4796235df46810c5c3 Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Thu, 11 Jul 2024 16:54:04 +0200 Subject: [PATCH 4/8] implement histograms with variable binning --- src/lgdo/lh5/_serializers/read/composite.py | 16 ++- src/lgdo/types/histogram.py | 135 ++++++++++++++------ tests/lh5/test_lh5_write.py | 46 +++++++ tests/types/test_histogram.py | 103 +++++++++++++-- 4 files changed, 242 insertions(+), 58 deletions(-) diff --git a/src/lgdo/lh5/_serializers/read/composite.py b/src/lgdo/lh5/_serializers/read/composite.py index 16697714..7d3efd6e 100644 --- a/src/lgdo/lh5/_serializers/read/composite.py +++ b/src/lgdo/lh5/_serializers/read/composite.py @@ -418,11 +418,17 @@ def _h5_read_histogram( decompress, ) isdensity = struct.isdensity.value - binning = [ - (a.binedges.first, a.binedges.last, a.binedges.step, a.closedleft) - for _, a in struct.binning.items() - ] - binning = [tuple(v.value for v in b) for b in binning] + binning = [] + for _, a in struct.binning.items(): + be = a.binedges + if isinstance(be, Struct): + b = (None, be.first.value, be.last.value, be.step.value, a.closedleft.value) + elif isinstance(be, Array): + b = (be, None, None, None, a.closedleft.value) + else: + msg = "unexpected binning of histogram" + raise LH5DecodeError(msg, h5g) + binning.append(Histogram.Axis(*b)) weights = struct.weights.view_as("np") histogram = Histogram(weights, binning, isdensity) diff --git a/src/lgdo/types/histogram.py b/src/lgdo/types/histogram.py index 14203ec6..e2017df6 100644 --- a/src/lgdo/types/histogram.py +++ b/src/lgdo/types/histogram.py @@ -15,9 +15,10 @@ class Histogram(Struct): class Axis(Struct): def __init__( self, - first: float, - last: float, - step: float, + edges: np.ndarray | Array | None, + first: float | None, + last: float | None, + step: float | None, closedleft: bool = True, binedge_attrs: dict[str, Any] | None = None, ) -> None: @@ -39,38 +40,67 @@ def __init__( attributes that will be added to the ``binedges`` LGDO that is part of the axis struct. """ - super().__init__( - { - "binedges": Struct( - { - "first": Scalar(first), - "last": Scalar(last), - "step": Scalar(step), - }, - binedge_attrs, - ), - "closedleft": Scalar(closedleft), - }, - ) + if edges is not None and ( + first is not None or last is not None or step is not None + ): + msg = "can only construct Axis either from edges or from range" + raise ValueError(msg) + if edges is None and (first is None or last is None or step is None): + msg = "did not pass all range parameters" + raise ValueError(msg) + + if edges is None: + edges = Struct( + { + "first": Scalar(first), + "last": Scalar(last), + "step": Scalar(step), + }, + binedge_attrs, + ) + else: + if not isinstance(edges, Array): + edges = Array(edges, attrs=binedge_attrs) + elif binedge_attrs is not None: + msg = "passed both binedge types.Array instance and binedge_attrs" + raise ValueError(msg) + + if len(edges.nda.shape) != 1: + msg = "must pass an array<1>{real} as edges vector" + raise ValueError(msg) + + super().__init__({"binedges": edges, "closedleft": Scalar(closedleft)}) @classmethod def from_edges(cls, edges: np.ndarray) -> Histogram.Axis: edge_diff = np.diff(edges) if np.any(~np.isclose(edge_diff, edge_diff[0])): - msg = "only evenly distributed edges can be converted" - raise ValueError(msg) - return cls(edges[0], edges[-1], edge_diff[0], True) + return cls(edges, None, None, None, True) + return cls(None, edges[0], edges[-1], edge_diff[0], True) + + @property + def is_range(self) -> bool: + return isinstance(self["binedges"], Struct) @property def first(self) -> float: + if not self.is_range: + msg = "Axis is not a range" + raise TypeError(msg) return self["binedges"]["first"].value @property def last(self) -> float: + if not self.is_range: + msg = "Axis is not a range" + raise TypeError(msg) return self["binedges"]["last"].value @property def step(self) -> float: + if not self.is_range: + msg = "Axis is not a range" + raise TypeError(msg) return self["binedges"]["step"].value @property @@ -79,16 +109,35 @@ def closedleft(self) -> bool: @property def nbins(self) -> int: - bins = (self.last - self.first) / self.step - bins_int = int(np.rint(bins)) - assert np.isclose(bins, bins_int) - return bins_int + if self.is_range: + bins = (self.last - self.first) / self.step + bins_int = int(np.rint(bins)) + assert np.isclose(bins, bins_int) + return bins_int + return len(self["binedges"].nda) - 1 + + @property + def edges(self) -> np.ndarray: + if self.is_range: + return np.linspace(self.first, self.last, self.nbins + 1) + return self["binedges"].nda def __str__(self) -> str: - return ( - f"first={self.first}, last={self.last}, step={self.step}, " - + f"closedleft={self.closedleft}" - ) + thr_orig = np.get_printoptions()["threshold"] + np.set_printoptions(threshold=8) + + if self.is_range: + string = f"first={self.first}, last={self.last}, step={self.step}" + else: + string = f"edges={self.edges}" + string += f", closedleft={self.closedleft}" + + attrs = self["binedges"].getattrs() + if attrs: + string += f" with attrs={attrs}" + + np.set_printoptions(threshold=thr_orig) + return string def __init__( self, @@ -96,7 +145,7 @@ def __init__( binning: None | list[Histogram.Axis] | list[np.ndarray] - | list[tuple[float, float, float, bool]] = None, + | list[tuple[float, float, float]] = None, isdensity: bool = False, attrs: dict[str, Any] | None = None, ) -> None: @@ -112,7 +161,7 @@ def __init__( binning * has to by None if a :class:`hist.Hist` has been passed as ``weights`` * can be a list of pre-initialized :class:`Histogram.Axis` - * can be a list of tuples, representing arguments to :meth:`Histogram.Axis.__init__()` + * can be a list of tuples, representing a range, ``(first, last, step)`` * can be a list of numpy arrays, as returned by :func:`numpy.histogramdd`. """ if isinstance(weights, hist.Hist): @@ -127,15 +176,15 @@ def __init__( msg = "flow bins of hist.Hist cannot be represented" raise ValueError(msg) weights_view = weights.view(flow=False) - if not isinstance(weights_view, np.ndarray): - msg = "only numpy-backed storages can be used in a hist.Hist" + if type(weights_view) is not np.ndarray: + msg = "only simple numpy-backed storages can be used in a hist.Hist" raise ValueError(msg) w = Array(weights_view) b = [] for ax in weights.axes: - if not isinstance(ax, hist.axis.Regular): - msg = "only regular axes of hist.Hist can be converted" + if not isinstance(ax, (hist.axis.Regular, hist.axis.Variable)): + msg = "only regular or variable axes of hist.Hist can be converted" raise ValueError(msg) b.append(Histogram.Axis.from_edges(ax.edges)) b = self._create_binning(b) @@ -150,7 +199,7 @@ def __init__( elif all(isinstance(ax, np.ndarray) for ax in binning): b = [Histogram.Axis.from_edges(ax) for ax in binning] elif all(isinstance(ax, tuple) for ax in binning): - b = [Histogram.Axis(*ax) for ax in binning] + b = [Histogram.Axis(None, *ax, True) for ax in binning] else: msg = "invalid binning object passed" raise ValueError(msg) @@ -254,23 +303,27 @@ def view_as( if not a.closedleft: msg = "hist.Hist cannot represent right-closed intervals" raise ValueError(msg) - hist_axes.append( - hist.axis.Regular( + if a.is_range: + hist_ax = hist.axis.Regular( bins=a.nbins, start=a.first, stop=a.last, underflow=False, overflow=False, ) - ) + else: + hist_ax = hist.axis.Variable( + a.edges, + underflow=False, + overflow=False, + ) + hist_axes.append(hist_ax) return hist.Hist(*hist_axes, data=self.weights.view_as("np")) if library == "np": - edges = [] - for a in self.binning: - edges.append(np.linspace(a.first, a.last, a.nbins)) - return self.weights.view_as("np"), tuple(edges) + edges = tuple([a.edges for a in self.binning]) + return self.weights.view_as("np"), edges msg = f"{library!r} is not a supported third-party format." raise TypeError(msg) diff --git a/tests/lh5/test_lh5_write.py b/tests/lh5/test_lh5_write.py index e4152b29..7955e641 100644 --- a/tests/lh5/test_lh5_write.py +++ b/tests/lh5/test_lh5_write.py @@ -454,3 +454,49 @@ def test_write_histogram(caplog, tmptestdir): write_start=1, group="my_group", ) + + +def test_write_histogram_variable(caplog, tmptestdir): + caplog.set_level(logging.DEBUG) + caplog.clear() + + # Start with an types.Histogram + if os.path.exists(f"{tmptestdir}/write_histogram_test.lh5"): + os.remove(f"{tmptestdir}/write_histogram_test.lh5") + + h1 = types.Histogram( + np.array([[1, 1], [1, 1]]), (np.array([0, 1.2, 2]), np.array([2.1, 2.5, 2.3])) + ) + h2 = types.Histogram( + np.array([[10, 10], [10, 10]]), + (np.array([2, 3.5, 4]), np.array([5, 6.5, 7])), + isdensity=True, + ) # Same field name, different values + store = lh5.LH5Store() + store.write( + h1, + "my_histogram", + f"{tmptestdir}/write_histogram_test.lh5", + group="my_group", + wo_mode="write_safe", + ) + store.write( + h2, + "my_histogram", + f"{tmptestdir}/write_histogram_test.lh5", + wo_mode="overwrite", + group="my_group", + ) + + # Now, check that the data were overwritten + h3, _ = store.read( + "my_group/my_histogram", f"{tmptestdir}/write_histogram_test.lh5" + ) + assert np.array_equal(h3.weights.nda, np.array([[10, 10], [10, 10]])) + assert np.array_equal(h3.binning[0].edges, np.array([2, 3.5, 4])) + with pytest.raises(TypeError): + x = h3.binning[0].first + with pytest.raises(TypeError): + x = h3.binning[1].last # noqa: F841 + assert not h3.binning[0].is_range + assert h3.isdensity diff --git a/tests/types/test_histogram.py b/tests/types/test_histogram.py index 5637dc20..f3623aed 100644 --- a/tests/types/test_histogram.py +++ b/tests/types/test_histogram.py @@ -4,10 +4,10 @@ import numpy as np import pytest -from lgdo import Histogram, Scalar +from lgdo import Array, Histogram, Scalar -def test_init_hist(): +def test_init_hist_regular(): h = hist.Hist( hist.axis.Regular(bins=10, start=0, stop=1, name="x"), hist.axis.Regular(bins=10, start=0, stop=1, name="y"), @@ -19,7 +19,7 @@ def test_init_hist(): assert len(h.binning) == 2 h = hist.Hist(hist.axis.Integer(start=0, stop=10)) - with pytest.raises(ValueError, match="only regular axes"): + with pytest.raises(ValueError, match="only regular or variable axes"): h = Histogram(h) h = hist.Hist(hist.axis.Regular(bins=10, start=0, stop=1)) @@ -40,6 +40,25 @@ def test_init_hist(): h.fill([1, 2, 3]) assert np.sum(hi.weights.nda) == 6 + h = hist.Hist( + hist.axis.Regular(bins=10, start=0, stop=10), storage=hist.storage.Mean() + ) + h.fill([1, 2, 3], sample=[4, 4, 4]) + with pytest.raises(ValueError, match="simple numpy-backed storages"): + hi = Histogram(h) + + +def test_init_hist_variable(): + h = hist.Hist( + hist.axis.Variable((0, 0.5, 5), name="x"), + hist.axis.Variable((0, 0.5, 5), name="y"), + ) + rng = np.random.default_rng() + h.fill(rng.uniform(size=500), rng.uniform(size=500)) + h = Histogram(h) + + assert len(h.binning) == 2 + def test_init_np(): h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) @@ -60,8 +79,7 @@ def test_init_np(): np.array([[1, 1], [1, 1]]), (np.array([0, 1, 2]), np.array([0, 1])) ) - with pytest.raises(ValueError, match="only evenly"): - h = Histogram(np.array([1, 1, 1]), (np.array([0, 1, 2.5, 3]),)) + h = Histogram(np.array([1, 1, 1]), (np.array([0, 1, 2.5, 3]),)) def test_datatype_name(): @@ -84,6 +102,7 @@ def test_axes(): assert ax0.last == 2 assert ax0.step == 1 assert isinstance(ax0.nbins, int) + assert len(ax0.edges) == 3 assert str(ax0) == "first=0, last=2, step=1, closedleft=True" ax1 = h.binning[1] @@ -92,11 +111,17 @@ def test_axes(): assert np.isclose(ax1.step, 0.1) assert isinstance(ax0.nbins, int) - h = Histogram(np.array([[1, 1], [1, 1]]), [(1, 3, 1, True), (4, 6, 1, False)]) + h = Histogram( + np.array([[1, 1], [1, 1]]), + [(1, 3, 1), (4, 6, 1)], + ) ax0 = h.binning[0] str(h) - h = Histogram(np.array([[1, 1], [1, 1]]), [(1, 3, 1, True), (4, 6, 1, False)]) + h = Histogram( + np.array([[1, 1], [1, 1]]), + [Histogram.Axis(None, 1, 3, 1), Histogram.Axis(None, 4, 6, 1, False)], + ) ax0 = h.binning[0] str(h) @@ -105,32 +130,86 @@ def test_axes(): h = Histogram( np.array([[1, 1], [1, 1]]), - [Histogram.Axis(1, 3, 1, True), Histogram.Axis(4, 6, 1, False)], + [Histogram.Axis(None, 1, 3, 1, True), Histogram.Axis(None, 4, 6, 1, False)], ) with pytest.raises(ValueError, match="invalid binning object"): h = Histogram( np.array([[1, 1], [1, 1]]), - [(1, 3, 1, True), Histogram.Axis(4, 6, 1, False)], + [(1, 3, 1), Histogram.Axis(None, 4, 6, 1, False)], + ) + + h = Histogram(np.array([1, 1, 1]), (np.array([0, 1, 2.5, 3]),)) + with pytest.raises(TypeError, match="range"): + x = h.binning[0].first + with pytest.raises(TypeError, match="range"): + x = h.binning[0].last + with pytest.raises(TypeError, match="range"): + x = h.binning[0].step # noqa: F841 + assert h.binning[0].nbins == 3 + assert str(h.binning[0]) == "edges=[0. 1. 2.5 3. ], closedleft=True" + + Histogram.Axis( + np.array([0, 1, 2.5, 3]), None, None, None, binedge_attrs={"units": "m"} + ) + + with pytest.raises(ValueError, match="binedge_attrs"): + Histogram.Axis( + Array(np.array([0, 1, 2.5, 3])), + None, + None, + None, + binedge_attrs={"units": "m"}, ) + with pytest.raises(ValueError, match=r"array<1>\{real\}"): + Histogram.Axis(Array(np.array([[0, 1], [2.5, 3]])), None, None, None) + + ax = Histogram.Axis( + np.array([0, 1, 2.5, 3]), + None, + None, + None, + binedge_attrs={"units": "m"}, + ) + assert ( + str(ax) == "edges=[0. 1. 2.5 3. ], closedleft=True with attrs={'units': 'm'}" + ) + + with pytest.raises(ValueError, match="either from edges or from range"): + Histogram.Axis(np.array([0, 1, 2.5, 3]), 0, 1, None) + with pytest.raises(ValueError, match="all range parameters"): + Histogram.Axis(None, 0, 1, None) def test_view_as_hist(): h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) - h.view_as("hist") + hi = h.view_as("hist") + assert isinstance(hi.axes[0], hist.axis.Regular) h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),), isdensity=True) with pytest.raises(ValueError, match="cannot represent density"): h.view_as("hist") - h = Histogram(np.array([[1, 1], [1, 1]]), [(1, 3, 1, True), (4, 6, 1, False)]) + h = Histogram( + np.array([[1, 1], [1, 1]]), + [Histogram.Axis(None, 1, 3, 1, True), Histogram.Axis(None, 4, 6, 1, False)], + ) with pytest.raises(ValueError, match="cannot represent right-closed"): h.view_as("hist") + h = Histogram(np.array([1, 1, 1]), (np.array([0, 1, 2.5, 3]),)) + hi = h.view_as("hist") + assert isinstance(hi.axes[0], hist.axis.Variable) + def test_view_as_np(): h = Histogram(np.array([1, 1]), (np.array([0, 1, 2]),)) - h.view_as("np") + assert h.binning[0].is_range + assert h.binning[0].nbins == 2 + nda, axes = h.view_as("np") + assert isinstance(nda, np.ndarray) + assert len(axes) == 1 + assert np.array_equal(axes[0], np.array([0, 1, 2])) def test_not_like_table(): From 6c9701d7e0ce40ccc5a02af02da6ff57b909cdf7 Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Wed, 17 Jul 2024 12:20:47 +0200 Subject: [PATCH 5/8] pass binedge_attrs through Histogram constructor --- src/lgdo/types/histogram.py | 35 +++++++++++++++++++++++++++-------- tests/types/test_histogram.py | 26 ++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/src/lgdo/types/histogram.py b/src/lgdo/types/histogram.py index e2017df6..63d8191f 100644 --- a/src/lgdo/types/histogram.py +++ b/src/lgdo/types/histogram.py @@ -62,7 +62,7 @@ def __init__( if not isinstance(edges, Array): edges = Array(edges, attrs=binedge_attrs) elif binedge_attrs is not None: - msg = "passed both binedge types.Array instance and binedge_attrs" + msg = "passed both binedge as Array LGDO instance and binedge_attrs" raise ValueError(msg) if len(edges.nda.shape) != 1: @@ -72,11 +72,13 @@ def __init__( super().__init__({"binedges": edges, "closedleft": Scalar(closedleft)}) @classmethod - def from_edges(cls, edges: np.ndarray) -> Histogram.Axis: + def from_edges( + cls, edges: np.ndarray, binedge_attrs: dict[str, Any] | None = None + ) -> Histogram.Axis: edge_diff = np.diff(edges) if np.any(~np.isclose(edge_diff, edge_diff[0])): - return cls(edges, None, None, None, True) - return cls(None, edges[0], edges[-1], edge_diff[0], True) + return cls(edges, None, None, None, True, binedge_attrs) + return cls(None, edges[0], edges[-1], edge_diff[0], True, binedge_attrs) @property def is_range(self) -> bool: @@ -132,13 +134,24 @@ def __str__(self) -> str: string = f"edges={self.edges}" string += f", closedleft={self.closedleft}" - attrs = self["binedges"].getattrs() + attrs = self.get_binedgeattrs() if attrs: string += f" with attrs={attrs}" np.set_printoptions(threshold=thr_orig) return string + def get_binedgeattrs(self, datatype: bool = False) -> dict: + """Return a copy of the LGDO attributes dictionary of the binedges + + Parameters + ---------- + datatype + if ``False``, remove ``datatype`` attribute from the output + dictionary. + """ + return self["binedges"].getattrs(datatype) + def __init__( self, weights: hist.Hist | np.ndarray, @@ -148,6 +161,7 @@ def __init__( | list[tuple[float, float, float]] = None, isdensity: bool = False, attrs: dict[str, Any] | None = None, + binedge_attrs: dict[str, Any] | None = None, ) -> None: """A special struct to contain histogrammed data. @@ -163,6 +177,8 @@ def __init__( * can be a list of pre-initialized :class:`Histogram.Axis` * can be a list of tuples, representing a range, ``(first, last, step)`` * can be a list of numpy arrays, as returned by :func:`numpy.histogramdd`. + binedge_attrs + attributes that will be added to the all ``binedges`` of all axes. """ if isinstance(weights, hist.Hist): if binning is not None: @@ -186,7 +202,7 @@ def __init__( if not isinstance(ax, (hist.axis.Regular, hist.axis.Variable)): msg = "only regular or variable axes of hist.Hist can be converted" raise ValueError(msg) - b.append(Histogram.Axis.from_edges(ax.edges)) + b.append(Histogram.Axis.from_edges(ax.edges, binedge_attrs)) b = self._create_binning(b) else: if binning is None: @@ -195,11 +211,14 @@ def __init__( w = Array(weights) if all(isinstance(ax, Histogram.Axis) for ax in binning): + if binedge_attrs is not None: + msg = "passed both binedges as Axis instances and binedge_attrs" + raise ValueError(msg) b = binning elif all(isinstance(ax, np.ndarray) for ax in binning): - b = [Histogram.Axis.from_edges(ax) for ax in binning] + b = [Histogram.Axis.from_edges(ax, binedge_attrs) for ax in binning] elif all(isinstance(ax, tuple) for ax in binning): - b = [Histogram.Axis(None, *ax, True) for ax in binning] + b = [Histogram.Axis(None, *ax, True, binedge_attrs) for ax in binning] else: msg = "invalid binning object passed" raise ValueError(msg) diff --git a/tests/types/test_histogram.py b/tests/types/test_histogram.py index f3623aed..ee45c0a9 100644 --- a/tests/types/test_histogram.py +++ b/tests/types/test_histogram.py @@ -149,6 +149,13 @@ def test_axes(): assert h.binning[0].nbins == 3 assert str(h.binning[0]) == "edges=[0. 1. 2.5 3. ], closedleft=True" + with pytest.raises(ValueError, match="either from edges or from range"): + Histogram.Axis(np.array([0, 1, 2.5, 3]), 0, 1, None) + with pytest.raises(ValueError, match="all range parameters"): + Histogram.Axis(None, 0, 1, None) + + +def test_ax_attributes(): Histogram.Axis( np.array([0, 1, 2.5, 3]), None, None, None, binedge_attrs={"units": "m"} ) @@ -175,10 +182,21 @@ def test_axes(): str(ax) == "edges=[0. 1. 2.5 3. ], closedleft=True with attrs={'units': 'm'}" ) - with pytest.raises(ValueError, match="either from edges or from range"): - Histogram.Axis(np.array([0, 1, 2.5, 3]), 0, 1, None) - with pytest.raises(ValueError, match="all range parameters"): - Histogram.Axis(None, 0, 1, None) + h = Histogram( + np.array([[1, 1], [1, 1]]), + (np.array([0, 1, 2]), np.array([0, 1, 2])), + binedge_attrs={"units": "m"}, + ) + assert str(h.binning[0]).endswith(", closedleft=True with attrs={'units': 'm'}") + assert str(h.binning[1]).endswith(", closedleft=True with attrs={'units': 'm'}") + assert h.binning[0].get_binedgeattrs() == {"units": "m"} + + with pytest.raises(ValueError): + h = Histogram( + np.array([[1, 1], [1, 1]]), + [Histogram.Axis(None, 1, 3, 1, True), Histogram.Axis(None, 4, 6, 1, False)], + binedge_attrs={"units": "m"}, + ) def test_view_as_hist(): From a2f585e66cc87f771aa646ffd6b6e8fc65798522 Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Tue, 30 Jul 2024 17:17:56 +0200 Subject: [PATCH 6/8] fail if trying to load hist from multiple files --- src/lgdo/lh5/_serializers/read/composite.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lgdo/lh5/_serializers/read/composite.py b/src/lgdo/lh5/_serializers/read/composite.py index 7d3efd6e..d5575331 100644 --- a/src/lgdo/lh5/_serializers/read/composite.py +++ b/src/lgdo/lh5/_serializers/read/composite.py @@ -177,6 +177,8 @@ def _h5_read_lgdo( idx=idx, use_h5idx=use_h5idx, field_mask=field_mask, + obj_buf=obj_buf, + obj_buf_start=obj_buf_start, decompress=decompress, ) @@ -406,8 +408,14 @@ def _h5_read_histogram( idx=None, use_h5idx=False, field_mask=None, + obj_buf=None, + obj_buf_start=0, decompress=True, ): + if obj_buf is not None or obj_buf_start != 0: + msg = "reading a histogram into an existing object buffer is not supported" + raise LH5DecodeError(msg, h5g) + struct, n_rows_read = _h5_read_struct( h5g, start_row, From 3dae2c11fd3911ccd8aa9c4a5f79dcc5ba16e22b Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Mon, 12 Aug 2024 13:25:42 +0200 Subject: [PATCH 7/8] requested typing fixes --- src/lgdo/__init__.py | 2 ++ src/lgdo/types/histogram.py | 20 +++++++++++--------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/lgdo/__init__.py b/src/lgdo/__init__.py index 47c721dd..08673f7c 100644 --- a/src/lgdo/__init__.py +++ b/src/lgdo/__init__.py @@ -33,6 +33,8 @@ :class:`dict` * :class:`.Table`: a :class:`.Struct` whose elements ("columns") are all array types with the same length (number of rows) +* :class:`.Histogram`: holds an array of histogrammed data, and the associated + binning of arbitrary dimensionality. Currently the primary on-disk format for LGDO object is LEGEND HDF5 (LH5) files. IO is done via the class :class:`.lh5_store.LH5Store`. LH5 files can also be diff --git a/src/lgdo/types/histogram.py b/src/lgdo/types/histogram.py index 63d8191f..3837c2a3 100644 --- a/src/lgdo/types/histogram.py +++ b/src/lgdo/types/histogram.py @@ -1,9 +1,11 @@ from __future__ import annotations +from collections.abc import Iterable from typing import Any import hist import numpy as np +from numpy.typing import NDArray from .array import Array from .lgdo import LGDO @@ -15,7 +17,7 @@ class Histogram(Struct): class Axis(Struct): def __init__( self, - edges: np.ndarray | Array | None, + edges: NDArray | Array | None, first: float | None, last: float | None, step: float | None, @@ -73,7 +75,7 @@ def __init__( @classmethod def from_edges( - cls, edges: np.ndarray, binedge_attrs: dict[str, Any] | None = None + cls, edges: NDArray, binedge_attrs: dict[str, Any] | None = None ) -> Histogram.Axis: edge_diff = np.diff(edges) if np.any(~np.isclose(edge_diff, edge_diff[0])): @@ -119,7 +121,7 @@ def nbins(self) -> int: return len(self["binedges"].nda) - 1 @property - def edges(self) -> np.ndarray: + def edges(self) -> NDArray: if self.is_range: return np.linspace(self.first, self.last, self.nbins + 1) return self["binedges"].nda @@ -154,11 +156,11 @@ def get_binedgeattrs(self, datatype: bool = False) -> dict: def __init__( self, - weights: hist.Hist | np.ndarray, + weights: hist.Hist | NDArray, binning: None - | list[Histogram.Axis] - | list[np.ndarray] - | list[tuple[float, float, float]] = None, + | Iterable[Histogram.Axis] + | Iterable[NDArray] + | Iterable[tuple[float, float, float]] = None, isdensity: bool = False, attrs: dict[str, Any] | None = None, binedge_attrs: dict[str, Any] | None = None, @@ -237,7 +239,7 @@ def __init__( attrs, ) - def _create_binning(self, axes: list[Histogram.Axis]) -> Struct: + def _create_binning(self, axes: Iterable[Histogram.Axis]) -> Struct: return Struct({f"axis_{i}": a for i, a in enumerate(axes)}) @property @@ -287,7 +289,7 @@ def __str__(self) -> str: def view_as( self, library: str, - ) -> tuple[np.ndarray] | hist.Hist: + ) -> tuple[NDArray] | hist.Hist: r"""View the histogram data as a third-party format data structure. This is typically a zero-copy or nearly zero-copy operation. From ad9684ba20b6fd0b0a8696418ff2b7e2aa99bab7 Mon Sep 17 00:00:00 2001 From: Manuel Huber Date: Mon, 12 Aug 2024 14:04:26 +0200 Subject: [PATCH 8/8] add tests with textdata, fix attributes reading --- src/lgdo/lh5/_serializers/read/composite.py | 6 ++++- tests/conftest.py | 2 +- tests/lh5/test_lh5_write.py | 6 ++++- tests/types/test_histogram.py | 26 ++++++++++++++++++++- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/lgdo/lh5/_serializers/read/composite.py b/src/lgdo/lh5/_serializers/read/composite.py index d5575331..b900146e 100644 --- a/src/lgdo/lh5/_serializers/read/composite.py +++ b/src/lgdo/lh5/_serializers/read/composite.py @@ -436,7 +436,11 @@ def _h5_read_histogram( else: msg = "unexpected binning of histogram" raise LH5DecodeError(msg, h5g) - binning.append(Histogram.Axis(*b)) + ax = Histogram.Axis(*b) + # copy attrs to "clone" the "whole" struct. + ax.attrs = a.getattrs(datatype=True) + ax["binedges"].attrs = be.getattrs(datatype=True) + binning.append(ax) weights = struct.weights.view_as("np") histogram = Histogram(weights, binning, isdensity) diff --git a/tests/conftest.py b/tests/conftest.py index fb79ad1e..033d1ddc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,5 +26,5 @@ def pytest_sessionfinish(session, exitstatus): # noqa: ARG001 @pytest.fixture(scope="session") def lgnd_test_data(): ldata = LegendTestData() - ldata.checkout("8f55832") + ldata.checkout("82ad6c2") return ldata diff --git a/tests/lh5/test_lh5_write.py b/tests/lh5/test_lh5_write.py index 7955e641..f7ed93fc 100644 --- a/tests/lh5/test_lh5_write.py +++ b/tests/lh5/test_lh5_write.py @@ -409,7 +409,10 @@ def test_write_histogram(caplog, tmptestdir): np.array([[10, 10], [10, 10]]), (np.array([2, 3, 4]), np.array([5, 6, 7])), isdensity=True, - ) # Same field name, different values + ) + h2.binning[0]["binedges"].attrs["units"] = "ns" + + # Same field name, different values store = lh5.LH5Store() store.write( h1, @@ -434,6 +437,7 @@ def test_write_histogram(caplog, tmptestdir): assert h3.binning[0].first == 2 assert h3.binning[1].last == 7 assert h3.isdensity + assert h3.binning[0].get_binedgeattrs() == {"units": "ns"} # Now, check that writing with other modes throws. for disallowed_wo_mode in ["append", "append_column"]: diff --git a/tests/types/test_histogram.py b/tests/types/test_histogram.py index ee45c0a9..e48352c6 100644 --- a/tests/types/test_histogram.py +++ b/tests/types/test_histogram.py @@ -4,7 +4,8 @@ import numpy as np import pytest -from lgdo import Array, Histogram, Scalar +from lgdo import Array, Histogram, Scalar, lh5 +from lgdo.lh5.exceptions import LH5DecodeError def test_init_hist_regular(): @@ -241,3 +242,26 @@ def test_not_like_table(): h.add_field("x", Scalar(1.0)) with pytest.raises(TypeError): h.remove_field("binning") + + +def test_read_histogram_testdata(lgnd_test_data): + file = lgnd_test_data.get_path("lh5/lgdo-histograms.lh5") + + h1 = lh5.read("test_histogram_range", file) + assert isinstance(h1, Histogram) + assert h1.binning[0].is_range + + h2 = lh5.read("test_histogram_variable", file) + assert isinstance(h2, Histogram) + assert not h2.binning[0].is_range + + h3 = lh5.read("test_histogram_range_w_attrs", file) + assert isinstance(h3, Histogram) + assert h3.binning[0].is_range + assert h3.binning[0]["binedges"].getattrs() == {"units": "m"} + + +def test_read_histogram_multiple(lgnd_test_data): + file = lgnd_test_data.get_path("lh5/lgdo-histograms.lh5") + with pytest.raises(LH5DecodeError): + lh5.read("test_histogram_range", [file, file])