From c0c54ba9202a1e8a3921f2917f1ba5a971f3f008 Mon Sep 17 00:00:00 2001 From: Felix Soubelet <19598248+fsoubelet@users.noreply.github.com> Date: Thu, 15 Aug 2024 17:22:23 +0200 Subject: [PATCH] Use dicts instead of OrderedDicts for headers (#133) --- CHANGELOG.md | 5 ++++- README.md | 10 +++++++--- tests/test_frame.py | 35 +++++++++++++++++------------------ tfs/__init__.py | 2 +- tfs/frame.py | 31 ++++++++++++++++--------------- tfs/reader.py | 9 ++++----- tfs/writer.py | 3 +-- 7 files changed, 50 insertions(+), 45 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e9e47d5..d3074f3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,9 @@ # TFS-Pandas Changelog -## IN PROGRESS - 3.9.0 +## Version 3.8.2 + +- Changed: + - The headers of a `TfsDataFrame` are now stored as a `dict` and no longer an `OrderedDict`. This is transparent to the user. - Fixed: - Removed a workaround function which is no longer necessary due to the higher minimum `pandas` version. diff --git a/README.md b/README.md index a971f058..3521c1bb 100644 --- a/README.md +++ b/README.md @@ -4,24 +4,26 @@ [![Code Climate coverage](https://img.shields.io/codeclimate/coverage/pylhc/tfs.svg?style=popout)](https://codeclimate.com/github/pylhc/tfs) [![Code Climate maintainability (percentage)](https://img.shields.io/codeclimate/maintainability-percentage/pylhc/tfs.svg?style=popout)](https://codeclimate.com/github/pylhc/tfs) -[![PyPI Version](https://img.shields.io/pypi/v/tfs-pandas?label=PyPI&logo=pypi)](https://pypi.org/project/tfs-pandas/) [![GitHub release](https://img.shields.io/github/v/release/pylhc/tfs?logo=github)](https://github.com/pylhc/tfs/) +[![PyPI Version](https://img.shields.io/pypi/v/tfs-pandas?label=PyPI&logo=pypi)](https://pypi.org/project/tfs-pandas/) [![Conda-forge Version](https://img.shields.io/conda/vn/conda-forge/tfs-pandas?color=orange&logo=anaconda)](https://anaconda.org/conda-forge/tfs-pandas) 
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5070986.svg)](https://doi.org/10.5281/zenodo.5070986) -This package provides reading and writing functionality for [**Table Format System (TFS)** files](http://mad.web.cern.ch/mad/madx.old/Introduction/tfs.html). -Files are read into a `TfsDataFrame`, a class built on top of the famous `pandas.DataFrame`, which in addition to the normal behavior attaches an `OrderedDict` of headers to the `DataFrame`. +This package provides reading and writing functionality for [**Table Format System (TFS)** files](http://mad.web.cern.ch/mad/madx.old/Introduction/tfs.html). +Files are read into a `TfsDataFrame`, a class built on top of the famous `pandas.DataFrame`, which in addition to the normal behavior attaches a dictionary of headers to the `DataFrame`. See the [API documentation](https://pylhc.github.io/tfs/) for details. ## Installing Installation is easily done via `pip`: + ```bash python -m pip install tfs-pandas ``` One can also install in a `conda`/`mamba` environment via the `conda-forge` channel with: + ```bash conda install -c conda-forge tfs-pandas ``` @@ -29,6 +31,7 @@ conda install -c conda-forge tfs-pandas ## Example Usage The package is imported as `tfs`, and exports top-level functions for reading and writing: + ```python import tfs @@ -50,6 +53,7 @@ tfs.write("path_to_output.tfs", data_frame, save_index="index_column") ``` Reading and writing compressed files is also supported, and done automatically based on the provided file extension: + ```python import tfs diff --git a/tests/test_frame.py b/tests/test_frame.py index 8d7ee87e..7e84399c 100644 --- a/tests/test_frame.py +++ b/tests/test_frame.py @@ -1,5 +1,4 @@ import pathlib -from collections import OrderedDict from functools import partial, reduce import pandas as pd @@ -21,8 +20,8 @@ def test_validate_raises_on_wrong_unique_behavior(self): @pytest.mark.parametrize("how", ["invalid", "not_left", "not_right"]) def 
test_merge_headers_raises_on_invalid_how_key(self, how): - headers_left = OrderedDict() - headers_right = OrderedDict() + headers_left = {} + headers_right = {} with pytest.raises(ValueError, match="Invalid 'how' argument"): merge_headers(headers_left, headers_right, how=how) @@ -49,7 +48,7 @@ def test_correct_merging(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, how_hea result = dframe_x.merge(dframe_y, how_headers=how_headers, how=how, on=on) assert isinstance(result, TfsDataFrame) - assert isinstance(result.headers, OrderedDict) + assert isinstance(result.headers, dict) assert_dict_equal(result.headers, merge_headers(dframe_x.headers, dframe_y.headers, how=how_headers)) assert_frame_equal(result, pd.DataFrame(dframe_x).merge(pd.DataFrame(dframe_y), how=how, on=on)) @@ -64,10 +63,10 @@ def test_merging_accepts_pandas_dataframe( result = dframe_x.merge(dframe_y, how_headers=how_headers, how=how, on=on) assert isinstance(result, TfsDataFrame) - assert isinstance(result.headers, OrderedDict) + assert isinstance(result.headers, dict) - # using empty OrderedDict here as it's what dframe_y is getting when converted in the call - assert_dict_equal(result.headers, merge_headers(dframe_x.headers, OrderedDict(), how=how_headers)) + # using empty dict here as it's what dframe_y is getting when converted in the call + assert_dict_equal(result.headers, merge_headers(dframe_x.headers, headers_right={}, how=how_headers)) assert_frame_equal(result, pd.DataFrame(dframe_x).merge(pd.DataFrame(dframe_y), how=how, on=on)) @@ -78,7 +77,7 @@ def test_headers_merging_left(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, ho headers_right = tfs.read(_tfs_file_y_pathlib).headers result = merge_headers(headers_left, headers_right, how=how) - assert isinstance(result, OrderedDict) + assert isinstance(result, dict) assert len(result) >= len(headers_left) # no key disappeared assert len(result) >= len(headers_right) # no key disappeared for key in result: # check that we prioritized 
headers_left's contents @@ -91,7 +90,7 @@ def test_headers_merging_right(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, h headers_right = tfs.read(_tfs_file_y_pathlib).headers result = merge_headers(headers_left, headers_right, how=how) - assert isinstance(result, OrderedDict) + assert isinstance(result, dict) assert len(result) >= len(headers_left) # no key disappeared assert len(result) >= len(headers_right) # no key disappeared for key in result: # check that we prioritized headers_right's contents @@ -103,17 +102,17 @@ def test_headers_merging_none_returns_empty_dict(self, _tfs_file_x_pathlib, _tfs headers_left = tfs.read(_tfs_file_x_pathlib).headers headers_right = tfs.read(_tfs_file_y_pathlib).headers result = merge_headers(headers_left, headers_right, how=how) - assert result == OrderedDict() # giving None returns empty headers + assert result == {} # giving None returns empty headers def test_providing_new_headers_overrides_merging(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib): dframe_x = tfs.read(_tfs_file_x_pathlib) dframe_y = tfs.read(_tfs_file_y_pathlib) - assert dframe_x.merge(right=dframe_y, new_headers={}).headers == OrderedDict() - assert dframe_y.merge(right=dframe_x, new_headers={}).headers == OrderedDict() + assert dframe_x.merge(right=dframe_y, new_headers={}).headers == {} + assert dframe_y.merge(right=dframe_x, new_headers={}).headers == {} - assert tfs.concat([dframe_x, dframe_y], new_headers={}).headers == OrderedDict() - assert tfs.concat([dframe_y, dframe_x], new_headers={}).headers == OrderedDict() + assert tfs.concat([dframe_x, dframe_y], new_headers={}).headers == {} + assert tfs.concat([dframe_y, dframe_x], new_headers={}).headers == {} class TestPrinting: @@ -157,7 +156,7 @@ def test_correct_concatenating(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, h merger = partial(merge_headers, how=how_headers) all_headers = (tfsdframe.headers for tfsdframe in objs) assert isinstance(result, TfsDataFrame) - assert isinstance(result.headers, 
OrderedDict) + assert isinstance(result.headers, dict) assert_dict_equal(result.headers, reduce(merger, all_headers)) assert_frame_equal(result, pd.concat(objs, axis=axis, join=join)) @@ -175,10 +174,10 @@ def test_concatenating_accepts_pandas_dataframes( merger = partial(merge_headers, how=how_headers) # all_headers = (tfsdframe.headers for tfsdframe in objs) assert isinstance(result, TfsDataFrame) - assert isinstance(result.headers, OrderedDict) + assert isinstance(result.headers, dict) - all_headers = [ # empty OrderedDicts here as it's what objects are getting when converted in the call - dframe.headers if isinstance(dframe, TfsDataFrame) else OrderedDict() for dframe in objs + all_headers = [ # empty dicts here as it's what objects are getting when converted in the call + dframe.headers if isinstance(dframe, TfsDataFrame) else {} for dframe in objs ] assert_dict_equal(result.headers, reduce(merger, all_headers)) assert_frame_equal(result, pd.concat(objs, axis=axis, join=join)) diff --git a/tfs/__init__.py b/tfs/__init__.py index 554c977b..61ae5d07 100644 --- a/tfs/__init__.py +++ b/tfs/__init__.py @@ -11,7 +11,7 @@ __title__ = "tfs-pandas" __description__ = "Read and write tfs files." 
__url__ = "https://github.com/pylhc/tfs" -__version__ = "3.8.1" +__version__ = "3.8.2" __author__ = "pylhc" __author_email__ = "pylhc@github.com" __license__ = "MIT" diff --git a/tfs/frame.py b/tfs/frame.py index ba4f9389..24f03d2e 100644 --- a/tfs/frame.py +++ b/tfs/frame.py @@ -9,7 +9,6 @@ from __future__ import annotations import logging -from collections import OrderedDict from contextlib import suppress from functools import partial, reduce from typing import TYPE_CHECKING, ClassVar @@ -147,23 +146,25 @@ def merge( return TfsDataFrame(data=dframe, headers=new_headers) -def merge_headers(headers_left: dict, headers_right: dict, how: str) -> OrderedDict: +def merge_headers(headers_left: dict, headers_right: dict, how: str) -> dict: """ Merge headers of two ``TfsDataFrames`` together. Args: - headers_left (dict): Headers of caller (left) ``TfsDataFrame`` when calling ``.append``, ``.join`` or - ``.merge``. Headers of the left (preceeding) ``TfsDataFrame`` when calling ``tfs.frame.concat``. - headers_right (dict): Headers of other (right) ``TfsDataFrame`` when calling ``.append``, ``.join`` - or ``.merge``. Headers of the left (preceeding) ``TfsDataFrame`` when calling - ``tfs.frame.concat``. - how (str): Type of merge to be performed, either **left** or **right**. If **left*, prioritize keys - from **headers_left** in case of duplicate keys. If **right**, prioritize keys from - **headers_right** in case of duplicate keys. Case insensitive. If ``None`` is given, - an empty dictionary will be returned. + headers_left (dict): Headers of caller (left) ``TfsDataFrame`` when calling + ``.append``, ``.join`` or ``.merge``. Headers of the left (preceding) + ``TfsDataFrame`` when calling ``tfs.frame.concat``. + headers_right (dict): Headers of other (right) ``TfsDataFrame`` when calling + ``.append``, ``.join`` or ``.merge``. Headers of the right (following) + ``TfsDataFrame`` when calling ``tfs.frame.concat``. 
+ how (str): Type of merge to be performed, either **left** or **right**. If + **left**, prioritize keys from **headers_left** in case of duplicate keys. + If **right**, prioritize keys from **headers_right** in case of duplicate + keys. Case-insensitive. If ``None`` is given, an empty dictionary will be + returned. Returns: - A new ``OrderedDict`` as the merge of the two provided dictionaries. + A new dictionary as the merge of the two provided dictionaries. """ accepted_merges: set[str] = {"left", "right", "none"} if str(how).lower() not in accepted_merges: # handles being given None @@ -172,14 +173,14 @@ def merge_headers(headers_left: dict, headers_right: dict, how: str) -> OrderedD LOGGER.debug(f"Merging headers with method '{how}'") if str(how).lower() == "left": # we prioritize the contents of headers_left - result = headers_right.copy() + result: dict = headers_right.copy() result.update(headers_left) elif str(how).lower() == "right": # we prioritize the contents of headers_right - result = headers_left.copy() + result: dict = headers_left.copy() result.update(headers_right) else: # we were given None, result will be an empty dict result = {} - return OrderedDict(result) # so that the TfsDataFrame still has an OrderedDict as header + return result def concat( diff --git a/tfs/reader.py b/tfs/reader.py index 296515a2..c97eb870 100644 --- a/tfs/reader.py +++ b/tfs/reader.py @@ -10,7 +10,6 @@ import logging import pathlib import shlex -from collections import OrderedDict from dataclasses import dataclass import numpy as np @@ -168,7 +167,7 @@ def read_tfs( return tfs_data_frame -def read_headers(tfs_file_path: pathlib.Path | str) -> OrderedDict: +def read_headers(tfs_file_path: pathlib.Path | str) -> dict: """ Parses the top of the **tfs_file_path** and returns the headers. @@ -178,7 +177,7 @@ def read_headers(tfs_file_path: pathlib.Path | str) -> OrderedDict: a Path object. Returns: - An ``OrderedDict`` with the headers read from the file. 
+ A dictionary with the headers read from the file. Examples: @@ -207,7 +206,7 @@ def read_headers(tfs_file_path: pathlib.Path | str) -> OrderedDict: class _TfsMetaData: """A dataclass to encapsulate the metadata read from a TFS file.""" - headers: OrderedDict + headers: dict non_data_lines: int column_names: np.ndarray column_types: np.ndarray @@ -234,7 +233,7 @@ def _read_metadata(tfs_file_path: pathlib.Path | str) -> _TfsMetaData: """ LOGGER.debug("Reading headers and metadata from file") tfs_file_path = pathlib.Path(tfs_file_path) - headers = OrderedDict() + headers = {} column_names = column_types = None # Read the headers, chunk by chunk (line by line) with pandas.read_csv as a diff --git a/tfs/writer.py b/tfs/writer.py index 32ea7490..04985046 100644 --- a/tfs/writer.py +++ b/tfs/writer.py @@ -9,7 +9,6 @@ import logging import pathlib -from collections import OrderedDict import numpy as np import pandas as pd @@ -112,7 +111,7 @@ def write_tfs( try: headers_dict = data_frame.headers except AttributeError: - headers_dict = OrderedDict() + headers_dict = {} data_frame = data_frame.convert_dtypes(convert_integer=False)