Commit
Use dicts instead of OrderedDicts for headers (#133)
fsoubelet authored Aug 15, 2024
1 parent 7051595 commit c0c54ba
Showing 7 changed files with 50 additions and 45 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,9 @@
# TFS-Pandas Changelog

## IN PROGRESS - 3.9.0
## Version 3.8.2

- Changed:
  - The headers of a `TfsDataFrame` are now stored as a `dict` and no longer an `OrderedDict`. This is transparent to the user.

- Fixed:
  - Removed a workaround function which is no longer necessary due to the higher minimum `pandas` version.
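
The change is transparent because built-in `dict` objects preserve insertion order since Python 3.7, so header access and iteration behave exactly as before. A minimal sketch (header names and values are made up):

```python
headers = {"TITLE": "Example optics", "ENERGY": 6800.0}

# iteration still follows insertion order, just as with the old OrderedDict
assert list(headers) == ["TITLE", "ENERGY"]
assert headers["ENERGY"] == 6800.0
```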
10 changes: 7 additions & 3 deletions README.md
@@ -4,31 +4,34 @@
[![Code Climate coverage](https://img.shields.io/codeclimate/coverage/pylhc/tfs.svg?style=popout)](https://codeclimate.com/github/pylhc/tfs)
[![Code Climate maintainability (percentage)](https://img.shields.io/codeclimate/maintainability-percentage/pylhc/tfs.svg?style=popout)](https://codeclimate.com/github/pylhc/tfs)
<!-- [![GitHub last commit](https://img.shields.io/github/last-commit/pylhc/tfs.svg?style=popout)](https://github.com/pylhc/tfs/) -->
[![PyPI Version](https://img.shields.io/pypi/v/tfs-pandas?label=PyPI&logo=pypi)](https://pypi.org/project/tfs-pandas/)
[![GitHub release](https://img.shields.io/github/v/release/pylhc/tfs?logo=github)](https://github.com/pylhc/tfs/)
[![PyPI Version](https://img.shields.io/pypi/v/tfs-pandas?label=PyPI&logo=pypi)](https://pypi.org/project/tfs-pandas/)
[![Conda-forge Version](https://img.shields.io/conda/vn/conda-forge/tfs-pandas?color=orange&logo=anaconda)](https://anaconda.org/conda-forge/tfs-pandas)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5070986.svg)](https://doi.org/10.5281/zenodo.5070986)

This package provides reading and writing functionality for [**Table Format System (TFS)** files](http://mad.web.cern.ch/mad/madx.old/Introduction/tfs.html).
Files are read into a `TfsDataFrame`, a class built on top of the famous `pandas.DataFrame`, which in addition to the normal behavior attaches an `OrderedDict` of headers to the `DataFrame`.
This package provides reading and writing functionality for [**Table Format System (TFS)** files](http://mad.web.cern.ch/mad/madx.old/Introduction/tfs.html).
Files are read into a `TfsDataFrame`, a class built on top of the famous `pandas.DataFrame`, which in addition to the normal behavior attaches a dictionary of headers to the `DataFrame`.

See the [API documentation](https://pylhc.github.io/tfs/) for details.
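
A minimal sketch of the headers access described above (the file name and header key are hypothetical):

```python
import tfs

data_frame = tfs.read("measurement.tfs")

# the headers attribute is a plain dictionary attached to the dataframe
print(data_frame.headers)
print(data_frame.headers.get("TITLE"))
```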

## Installing

Installation is easily done via `pip`:

```bash
python -m pip install tfs-pandas
```

One can also install in a `conda`/`mamba` environment via the `conda-forge` channel with:

```bash
conda install -c conda-forge tfs-pandas
```

## Example Usage

The package is imported as `tfs`, and exports top-level functions for reading and writing:

```python
import tfs

@@ -50,6 +53,7 @@ tfs.write("path_to_output.tfs", data_frame, save_index="index_column")
```

Reading and writing compressed files is also supported, and done automatically based on the provided file extension:

```python
import tfs
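
# The lines below are an illustrative sketch (file names are hypothetical):
# the compression format is inferred from the file extension on read and write.
compressed_df = tfs.read("measurement.tfs.gz")
tfs.write("measurement_copy.tfs.bz2", compressed_df)
```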

35 changes: 17 additions & 18 deletions tests/test_frame.py
@@ -1,5 +1,4 @@
import pathlib
from collections import OrderedDict
from functools import partial, reduce

import pandas as pd
@@ -21,8 +20,8 @@ def test_validate_raises_on_wrong_unique_behavior(self):

@pytest.mark.parametrize("how", ["invalid", "not_left", "not_right"])
def test_merge_headers_raises_on_invalid_how_key(self, how):
headers_left = OrderedDict()
headers_right = OrderedDict()
headers_left = {}
headers_right = {}

with pytest.raises(ValueError, match="Invalid 'how' argument"):
merge_headers(headers_left, headers_right, how=how)
@@ -49,7 +48,7 @@ def test_correct_merging(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, how_hea
result = dframe_x.merge(dframe_y, how_headers=how_headers, how=how, on=on)

assert isinstance(result, TfsDataFrame)
assert isinstance(result.headers, OrderedDict)
assert isinstance(result.headers, dict)
assert_dict_equal(result.headers, merge_headers(dframe_x.headers, dframe_y.headers, how=how_headers))
assert_frame_equal(result, pd.DataFrame(dframe_x).merge(pd.DataFrame(dframe_y), how=how, on=on))

@@ -64,10 +63,10 @@ def test_merging_accepts_pandas_dataframe(
result = dframe_x.merge(dframe_y, how_headers=how_headers, how=how, on=on)

assert isinstance(result, TfsDataFrame)
assert isinstance(result.headers, OrderedDict)
assert isinstance(result.headers, dict)

# using empty OrderedDict here as it's what dframe_y is getting when converted in the call
assert_dict_equal(result.headers, merge_headers(dframe_x.headers, OrderedDict(), how=how_headers))
# using empty dict here as it's what dframe_y is getting when converted in the call
assert_dict_equal(result.headers, merge_headers(dframe_x.headers, headers_right={}, how=how_headers))
assert_frame_equal(result, pd.DataFrame(dframe_x).merge(pd.DataFrame(dframe_y), how=how, on=on))


@@ -78,7 +77,7 @@ def test_headers_merging_left(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, ho
headers_right = tfs.read(_tfs_file_y_pathlib).headers
result = merge_headers(headers_left, headers_right, how=how)

assert isinstance(result, OrderedDict)
assert isinstance(result, dict)
assert len(result) >= len(headers_left) # no key disappeared
assert len(result) >= len(headers_right) # no key disappeared
for key in result: # check that we prioritized headers_left's contents
@@ -91,7 +90,7 @@ def test_headers_merging_right(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, h
headers_right = tfs.read(_tfs_file_y_pathlib).headers
result = merge_headers(headers_left, headers_right, how=how)

assert isinstance(result, OrderedDict)
assert isinstance(result, dict)
assert len(result) >= len(headers_left) # no key disappeared
assert len(result) >= len(headers_right) # no key disappeared
for key in result: # check that we prioritized headers_right's contents
@@ -103,17 +102,17 @@ def test_headers_merging_none_returns_empty_dict(self, _tfs_file_x_pathlib, _tfs
headers_left = tfs.read(_tfs_file_x_pathlib).headers
headers_right = tfs.read(_tfs_file_y_pathlib).headers
result = merge_headers(headers_left, headers_right, how=how)
assert result == OrderedDict() # giving None returns empty headers
assert result == {} # giving None returns empty headers

def test_providing_new_headers_overrides_merging(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib):
dframe_x = tfs.read(_tfs_file_x_pathlib)
dframe_y = tfs.read(_tfs_file_y_pathlib)

assert dframe_x.merge(right=dframe_y, new_headers={}).headers == OrderedDict()
assert dframe_y.merge(right=dframe_x, new_headers={}).headers == OrderedDict()
assert dframe_x.merge(right=dframe_y, new_headers={}).headers == {}
assert dframe_y.merge(right=dframe_x, new_headers={}).headers == {}

assert tfs.concat([dframe_x, dframe_y], new_headers={}).headers == OrderedDict()
assert tfs.concat([dframe_y, dframe_x], new_headers={}).headers == OrderedDict()
assert tfs.concat([dframe_x, dframe_y], new_headers={}).headers == {}
assert tfs.concat([dframe_y, dframe_x], new_headers={}).headers == {}


class TestPrinting:
@@ -157,7 +156,7 @@ def test_correct_concatenating(self, _tfs_file_x_pathlib, _tfs_file_y_pathlib, h
merger = partial(merge_headers, how=how_headers)
all_headers = (tfsdframe.headers for tfsdframe in objs)
assert isinstance(result, TfsDataFrame)
assert isinstance(result.headers, OrderedDict)
assert isinstance(result.headers, dict)
assert_dict_equal(result.headers, reduce(merger, all_headers))
assert_frame_equal(result, pd.concat(objs, axis=axis, join=join))

@@ -175,10 +174,10 @@ def test_concatenating_accepts_pandas_dataframes(
merger = partial(merge_headers, how=how_headers)
# all_headers = (tfsdframe.headers for tfsdframe in objs)
assert isinstance(result, TfsDataFrame)
assert isinstance(result.headers, OrderedDict)
assert isinstance(result.headers, dict)

all_headers = [ # empty OrderedDicts here as it's what objects are getting when converted in the call
dframe.headers if isinstance(dframe, TfsDataFrame) else OrderedDict() for dframe in objs
all_headers = [ # empty dicts here as it's what objects are getting when converted in the call
dframe.headers if isinstance(dframe, TfsDataFrame) else {} for dframe in objs
]
assert_dict_equal(result.headers, reduce(merger, all_headers))
assert_frame_equal(result, pd.concat(objs, axis=axis, join=join))
2 changes: 1 addition & 1 deletion tfs/__init__.py
@@ -11,7 +11,7 @@
__title__ = "tfs-pandas"
__description__ = "Read and write tfs files."
__url__ = "https://github.com/pylhc/tfs"
__version__ = "3.8.1"
__version__ = "3.8.2"
__author__ = "pylhc"
__author_email__ = "[email protected]"
__license__ = "MIT"
31 changes: 16 additions & 15 deletions tfs/frame.py
@@ -9,7 +9,6 @@
from __future__ import annotations

import logging
from collections import OrderedDict
from contextlib import suppress
from functools import partial, reduce
from typing import TYPE_CHECKING, ClassVar
@@ -147,23 +146,25 @@ def merge(
return TfsDataFrame(data=dframe, headers=new_headers)


def merge_headers(headers_left: dict, headers_right: dict, how: str) -> OrderedDict:
def merge_headers(headers_left: dict, headers_right: dict, how: str) -> dict:
"""
Merge headers of two ``TfsDataFrames`` together.
Args:
headers_left (dict): Headers of caller (left) ``TfsDataFrame`` when calling ``.append``, ``.join`` or
``.merge``. Headers of the left (preceeding) ``TfsDataFrame`` when calling ``tfs.frame.concat``.
headers_right (dict): Headers of other (right) ``TfsDataFrame`` when calling ``.append``, ``.join``
or ``.merge``. Headers of the left (preceeding) ``TfsDataFrame`` when calling
``tfs.frame.concat``.
how (str): Type of merge to be performed, either **left** or **right**. If **left*, prioritize keys
from **headers_left** in case of duplicate keys. If **right**, prioritize keys from
**headers_right** in case of duplicate keys. Case insensitive. If ``None`` is given,
an empty dictionary will be returned.
headers_left (dict): Headers of caller (left) ``TfsDataFrame`` when calling
``.append``, ``.join`` or ``.merge``. Headers of the left (preceding)
``TfsDataFrame`` when calling ``tfs.frame.concat``.
headers_right (dict): Headers of other (right) ``TfsDataFrame`` when calling
``.append``, ``.join`` or ``.merge``. Headers of the right (following)
``TfsDataFrame`` when calling ``tfs.frame.concat``.
how (str): Type of merge to be performed, either **left** or **right**. If
**left**, prioritize keys from **headers_left** in case of duplicate keys.
If **right**, prioritize keys from **headers_right** in case of duplicate
keys. Case-insensitive. If ``None`` is given, an empty dictionary will be
returned.
Returns:
A new ``OrderedDict`` as the merge of the two provided dictionaries.
A new dictionary as the merge of the two provided dictionaries.
"""
accepted_merges: set[str] = {"left", "right", "none"}
if str(how).lower() not in accepted_merges: # handles being given None
@@ -172,14 +173,14 @@ def merge_headers(headers_left: dict, headers_right: dict, how: str) -> OrderedD

LOGGER.debug(f"Merging headers with method '{how}'")
if str(how).lower() == "left": # we prioritize the contents of headers_left
result = headers_right.copy()
result: dict = headers_right.copy()
result.update(headers_left)
elif str(how).lower() == "right": # we prioritize the contents of headers_right
result = headers_left.copy()
result: dict = headers_left.copy()
result.update(headers_right)
else: # we were given None, result will be an empty dict
result = {}
return OrderedDict(result) # so that the TfsDataFrame still has an OrderedDict as header
return result


def concat(
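The merge priority described in the `merge_headers` docstring above can be illustrated with a short sketch (header names and values are made up):

```python
from tfs.frame import merge_headers

left = {"TITLE": "left frame", "ENERGY": 6800.0}
right = {"TITLE": "right frame", "PARTICLE": "proton"}

# how="left" keeps headers_left's value for duplicate keys,
# while keys unique to either side are all preserved
merged = merge_headers(left, right, how="left")
assert merged["TITLE"] == "left frame"
assert merged["PARTICLE"] == "proton"

# passing None returns an empty dictionary
assert merge_headers(left, right, how=None) == {}
```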
9 changes: 4 additions & 5 deletions tfs/reader.py
@@ -10,7 +10,6 @@
import logging
import pathlib
import shlex
from collections import OrderedDict
from dataclasses import dataclass

import numpy as np
@@ -168,7 +167,7 @@ def read_tfs(
return tfs_data_frame


def read_headers(tfs_file_path: pathlib.Path | str) -> OrderedDict:
def read_headers(tfs_file_path: pathlib.Path | str) -> dict:
"""
Parses the top of the **tfs_file_path** and returns the headers.
@@ -178,7 +177,7 @@ def read_headers(tfs_file_path: pathlib.Path | str) -> OrderedDict:
a Path object.
Returns:
An ``OrderedDict`` with the headers read from the file.
A dictionary with the headers read from the file.
Examples:
@@ -207,7 +206,7 @@ def read_headers(tfs_file_path: pathlib.Path | str) -> OrderedDict:
class _TfsMetaData:
"""A dataclass to encapsulate the metadata read from a TFS file."""

headers: OrderedDict
headers: dict
non_data_lines: int
column_names: np.ndarray
column_types: np.ndarray
@@ -234,7 +233,7 @@ def _read_metadata(tfs_file_path: pathlib.Path | str) -> _TfsMetaData:
"""
LOGGER.debug("Reading headers and metadata from file")
tfs_file_path = pathlib.Path(tfs_file_path)
headers = OrderedDict()
headers = {}
column_names = column_types = None

# Read the headers, chunk by chunk (line by line) with pandas.read_csv as a
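As a quick sketch of the `read_headers` helper documented above (the file name is hypothetical), the returned object is now a plain dictionary:

```python
from tfs.reader import read_headers

headers = read_headers("measurement.tfs")
print(type(headers))  # <class 'dict'>
print(headers.get("TITLE"))
```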
3 changes: 1 addition & 2 deletions tfs/writer.py
@@ -9,7 +9,6 @@

import logging
import pathlib
from collections import OrderedDict

import numpy as np
import pandas as pd
@@ -112,7 +111,7 @@ def write_tfs(
try:
headers_dict = data_frame.headers
except AttributeError:
headers_dict = OrderedDict()
headers_dict = {}

data_frame = data_frame.convert_dtypes(convert_integer=False)

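The fallback shown above means `write_tfs` also accepts a plain `pandas.DataFrame`, in which case the headers default to an empty dict. A minimal sketch (column names and output path are hypothetical):

```python
import pandas as pd
import tfs

plain = pd.DataFrame({"NAME": ["BPM.1", "BPM.2"], "S": [0.0, 12.5]})

# a plain DataFrame has no .headers attribute, so an empty dict is used
tfs.write("plain_frame.tfs", plain)
```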
