Skip to content

Commit

Permalink
Merge branch 'main' into to_numpy/pandas_numeric
Browse files Browse the repository at this point in the history
  • Loading branch information
seisman committed Nov 15, 2024
2 parents 4061ec7 + 3d08919 commit cdf7c38
Show file tree
Hide file tree
Showing 17 changed files with 215 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cache_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_doctests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests_dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests_legacy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/publish-to-pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ jobs:
ls -lh dist/
- name: Publish to Test PyPI
uses: pypa/[email protected].0
uses: pypa/[email protected].2
with:
repository-url: https://test.pypi.org/legacy/

- name: Publish to PyPI
if: startsWith(github.ref, 'refs/tags')
uses: pypa/[email protected].0
uses: pypa/[email protected].2
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
"contextily": ("https://contextily.readthedocs.io/en/stable/", None),
"geopandas": ("https://geopandas.org/en/stable/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
"pyarrow": ("https://arrow.apache.org/docs/", None),
"python": ("https://docs.python.org/3/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
"rasterio": ("https://rasterio.readthedocs.io/en/stable/", None),
Expand Down
6 changes: 3 additions & 3 deletions doc/ecosystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ Python objects. They are based on the C++ implementation of Arrow.
```{note}
If you have [PyArrow][] installed, PyGMT does have some initial support for
`pandas.Series` and `pandas.DataFrame` objects with Apache Arrow-backed arrays.
Specifically, only uint/int/float and date32/date64 are supported for now.
Support for string Array dtypes, Duration types and GeoArrow geometry types is still a work in progress.
For more details, see
Specifically, only uint/int/float, date32/date64 and string types are supported for now.
Support for Duration types and GeoArrow geometry types is still a work in progress. For
more details, see
[issue #2800](https://github.com/GenericMappingTools/pygmt/issues/2800).
```

Expand Down
10 changes: 10 additions & 0 deletions pygmt/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@
Type aliases for type hints.
"""

import contextlib
import importlib
from collections.abc import Sequence
from typing import Literal

import numpy as np

# Anchor codes: two-letter positions combining a vertical letter
# (T/M/B = Top/Middle/Bottom) with a horizontal letter (L/C/R = Left/Center/Right).
AnchorCode = Literal["TL", "TC", "TR", "ML", "MC", "MR", "BL", "BC", "BR"]

# String array types: a sequence of strings or a numpy.ndarray.
# If pyarrow is importable, pyarrow.StringArray is added to the union as well;
# the import goes through importlib inside suppress(ImportError) so that
# pyarrow stays an optional dependency and this module imports cleanly
# without it.
StringArrayTypes = Sequence[str] | np.ndarray
with contextlib.suppress(ImportError):
    StringArrayTypes |= importlib.import_module(name="pyarrow").StringArray
27 changes: 20 additions & 7 deletions pygmt/clib/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Functions to convert data types into ctypes friendly formats.
"""

import contextlib
import ctypes as ctp
import warnings
from collections.abc import Sequence
Expand Down Expand Up @@ -156,14 +157,19 @@ def _to_numpy(data: Any) -> np.ndarray:
array
The C contiguous NumPy array.
"""
# Mapping of unsupported dtypes to the expected NumPy dtype.
dtypes: dict[str, type] = {
"date32[day][pyarrow]": np.datetime64,
"date64[ms][pyarrow]": np.datetime64,
# Mapping of unsupported dtypes to expected NumPy dtypes.
dtypes: dict[str, type | str] = {
# For string dtypes.
"large_string": np.str_, # pa.large_string and pa.large_utf8
"string": np.str_, # pa.string, pa.utf8, pd.StringDtype
"string_view": np.str_, # pa.string_view
# For datetime dtypes.
"date32[day][pyarrow]": "datetime64[D]",
"date64[ms][pyarrow]": "datetime64[ms]",
}

# The expected numpy dtype for the result numpy array, but can be None.
dtype = dtypes.get(str(getattr(data, "dtype", None)))
dtype = dtypes.get(str(getattr(data, "dtype", getattr(data, "type", None))))

# Workarounds for pandas < 2.2. Following SPEC 0, pandas 2.1 should be dropped in
# 2025 Q3, so it's likely we can remove the workaround in PyGMT v0.17.0.
Expand All @@ -184,6 +190,12 @@ def _to_numpy(data: Any) -> np.ndarray:
data = data.to_numpy(na_value=np.nan)

array = np.ascontiguousarray(data, dtype=dtype)

# Check if a np.object_ array can be converted to np.str_.
if array.dtype == np.object_:
with contextlib.suppress(TypeError, ValueError):
return np.ascontiguousarray(array, dtype=np.str_)

return array


Expand Down Expand Up @@ -284,12 +296,13 @@ def sequence_to_ctypes_array(

def strings_to_ctypes_array(strings: Sequence[str] | np.ndarray) -> ctp.Array:
"""
Convert a sequence (e.g., a list) of strings into a ctypes array.
Convert a sequence (e.g., a list) of strings or numpy.ndarray of strings into a
ctypes array.
Parameters
----------
strings
A sequence of strings.
A sequence of strings, or a numpy.ndarray of str dtype.
Returns
-------
Expand Down
6 changes: 3 additions & 3 deletions pygmt/clib/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -1475,7 +1475,7 @@ def virtualfile_from_vectors(
# 2 columns contains coordinates like longitude, latitude, or datetime string
# types.
for col, array in enumerate(arrays[2:]):
if pd.api.types.is_string_dtype(array.dtype):
if np.issubdtype(array.dtype, np.str_):
columns = col + 2
break

Expand Down Expand Up @@ -1506,9 +1506,9 @@ def virtualfile_from_vectors(
strings = string_arrays[0]
elif len(string_arrays) > 1:
strings = np.array(
[" ".join(vals) for vals in zip(*string_arrays, strict=True)]
[" ".join(vals) for vals in zip(*string_arrays, strict=True)],
dtype=np.str_,
)
strings = np.asanyarray(a=strings, dtype=np.str_)
self.put_strings(
dataset, family="GMT_IS_VECTOR|GMT_IS_DUPLICATE", strings=strings
)
Expand Down
6 changes: 3 additions & 3 deletions pygmt/src/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from collections.abc import Sequence

import numpy as np
from pygmt._typing import AnchorCode
from pygmt._typing import AnchorCode, StringArrayTypes
from pygmt.clib import Session
from pygmt.exceptions import GMTInvalidInput
from pygmt.helpers import (
Expand Down Expand Up @@ -48,7 +48,7 @@ def text_( # noqa: PLR0912
x=None,
y=None,
position: AnchorCode | None = None,
text=None,
text: str | StringArrayTypes | None = None,
angle=None,
font=None,
justify: bool | None | AnchorCode | Sequence[AnchorCode] = None,
Expand Down Expand Up @@ -104,7 +104,7 @@ def text_( # noqa: PLR0912
For example, ``position="TL"`` plots the text at the Top Left corner
of the map.
text : str or 1-D array
text
The text string, or an array of strings to plot on the figure.
angle: float, str, bool or list
Set the angle measured in degrees counter-clockwise from
Expand Down
122 changes: 120 additions & 2 deletions pygmt/tests/test_clib_to_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import sys
from datetime import date, datetime

import numpy as np
import numpy.testing as npt
Expand Down Expand Up @@ -52,11 +53,12 @@ def _check_result(result, expected_dtype):
np.complex128,
id="complex",
),
pytest.param(["abc", "defg", "12345"], np.str_, id="string"),
],
)
def test_to_numpy_python_types_numeric(data, expected_dtype):
def test_to_numpy_python_types(data, expected_dtype):
"""
Test the _to_numpy function with Python built-in numeric types.
Test the _to_numpy function with Python built-in types.
"""
result = _to_numpy(data)
_check_result(result, expected_dtype)
Expand Down Expand Up @@ -125,6 +127,17 @@ def test_to_numpy_numpy_numeric(dtype, expected_dtype):
npt.assert_array_equal(result, array, strict=True)


@pytest.mark.parametrize("dtype", [None, np.str_, "U10"])
def test_to_numpy_ndarray_numpy_dtypes_string(dtype):
    """
    Test the _to_numpy function with NumPy arrays of string types.
    """
    # Build the input array with each parametrized string dtype spelling.
    source = np.array(["abc", "defg", "12345"], dtype=dtype)
    converted = _to_numpy(source)
    # The conversion must always land on the NumPy fixed-width str dtype
    # and keep the element values intact.
    _check_result(converted, np.str_)
    npt.assert_array_equal(converted, source)


########################################################################################
# Test the _to_numpy function with pandas.Series.
#
Expand Down Expand Up @@ -249,6 +262,53 @@ def test_to_numpy_pandas_numeric_with_na(dtype, expected_dtype):
npt.assert_array_equal(result, np.array([1.0, np.nan, 5.0], dtype=expected_dtype))


@pytest.mark.parametrize(
    "dtype",
    [
        None,
        np.str_,
        "U10",
        "string[python]",
        pytest.param("string[pyarrow]", marks=skip_if_no(package="pyarrow")),
        pytest.param("string[pyarrow_numpy]", marks=skip_if_no(package="pyarrow")),
    ],
)
def test_to_numpy_pandas_series_pandas_dtypes_string(dtype):
    """
    Test the _to_numpy function with pandas.Series of pandas string types.

    In pandas, string arrays can be specified in multiple ways.

    Reference: https://pandas.pydata.org/docs/reference/api/pandas.StringDtype.html
    """
    # The same three strings, stored under each of the pandas string dtype
    # spellings (pyarrow-backed variants are skipped when pyarrow is absent).
    series = pd.Series(["abc", "defg", "12345"], dtype=dtype)
    converted = _to_numpy(series)
    # Regardless of the backing storage, the result must be a NumPy str array
    # with the original values preserved.
    _check_result(converted, np.str_)
    npt.assert_array_equal(converted, series)


@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed")
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        pytest.param("date32[day][pyarrow]", "datetime64[D]", id="date32[day]"),
        pytest.param("date64[ms][pyarrow]", "datetime64[ms]", id="date64[ms]"),
    ],
)
def test_to_numpy_pandas_series_pyarrow_dtypes_date(dtype, expected_dtype):
    """
    Test the _to_numpy function with pandas.Series of PyArrow date32/date64 types.
    """
    # Three consecutive days stored as a pyarrow-backed date Series.
    dates = pd.Series(pd.date_range(start="2024-01-01", periods=3), dtype=dtype)
    converted = _to_numpy(dates)
    _check_result(converted, np.datetime64)
    # Explicitly check the date unit ("D" or "ms"), not just the datetime64 kind.
    assert converted.dtype == expected_dtype
    expected = np.array(
        ["2024-01-01", "2024-01-02", "2024-01-03"], dtype=expected_dtype
    )
    npt.assert_array_equal(converted, expected)


########################################################################################
# Test the _to_numpy function with PyArrow arrays.
#
Expand All @@ -258,6 +318,10 @@ def test_to_numpy_pandas_numeric_with_na(dtype, expected_dtype):
# - int8, int16, int32, int64
# - uint8, uint16, uint32, uint64
# - float16, float32, float64
# - String types: string/utf8, large_string/large_utf8, string_view
# - Date types:
# - date32[day]
# - date64[ms]
#
# In PyArrow, array types can be specified in two ways:
#
Expand Down Expand Up @@ -326,3 +390,57 @@ def test_to_numpy_pyarrow_numeric_with_na(dtype, expected_dtype):
result = _to_numpy(array)
_check_result(result, expected_dtype)
npt.assert_array_equal(result, array)


@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed")
@pytest.mark.parametrize(
    "dtype",
    [
        None,
        "string",
        "utf8",  # alias for string
        "large_string",
        "large_utf8",  # alias for large_string
        "string_view",
    ],
)
def test_to_numpy_pyarrow_array_pyarrow_dtypes_string(dtype):
    """
    Test the _to_numpy function with PyArrow arrays of PyArrow string types.
    """
    # Build a pyarrow array for every string type spelling (None lets pyarrow
    # infer the type from the Python strings).
    pa_array = pa.array(["abc", "defg", "12345"], type=dtype)
    converted = _to_numpy(pa_array)
    # All pyarrow string flavors must convert to the NumPy str dtype with the
    # element values unchanged.
    _check_result(converted, np.str_)
    npt.assert_array_equal(converted, pa_array)


@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed")
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        pytest.param("date32[day]", "datetime64[D]", id="date32[day]"),
        pytest.param("date64[ms]", "datetime64[ms]", id="date64[ms]"),
    ],
)
def test_to_numpy_pyarrow_array_pyarrow_dtypes_date(dtype, expected_dtype):
    """
    Test the _to_numpy function with PyArrow arrays of PyArrow date types.

    date32[day] and date64[ms] are stored as 32-bit and 64-bit integers, respectively,
    representing the number of days and milliseconds since the UNIX epoch (1970-01-01).

    Here we explicitly check the dtype and date unit of the result.
    """
    # Mix plain date and datetime inputs on purpose: both must be accepted by
    # the pyarrow date types.
    pa_array = pa.array(
        [date(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)],
        type=dtype,
    )
    converted = _to_numpy(pa_array)
    _check_result(converted, np.datetime64)
    # Explicitly check the date unit ("D" or "ms"), not just the datetime64 kind.
    assert converted.dtype == expected_dtype
    expected = np.array(
        ["2024-01-01", "2024-01-02", "2024-01-03"], dtype=expected_dtype
    )
    npt.assert_array_equal(converted, expected)
Loading

0 comments on commit cdf7c38

Please sign in to comment.