Skip to content

Commit

Permalink
Merge branch 'main' into to_numpy/pandas_numeric
Browse files Browse the repository at this point in the history
  • Loading branch information
seisman committed Nov 15, 2024
2 parents 4061ec7 + 3d08919 commit cdf7c38
Show file tree
Hide file tree
Showing 17 changed files with 215 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cache_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_doctests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests_dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ci_tests_legacy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:

# Install Micromamba with conda-forge dependencies
- name: Setup Micromamba
uses: mamba-org/[email protected].0
uses: mamba-org/[email protected].1
with:
environment-name: pygmt
condarc: |
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/publish-to-pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@ jobs:
ls -lh dist/
- name: Publish to Test PyPI
uses: pypa/[email protected].0
uses: pypa/[email protected].2
with:
repository-url: https://test.pypi.org/legacy/

- name: Publish to PyPI
if: startsWith(github.ref, 'refs/tags')
uses: pypa/[email protected].0
uses: pypa/[email protected].2
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
"contextily": ("https://contextily.readthedocs.io/en/stable/", None),
"geopandas": ("https://geopandas.org/en/stable/", None),
"numpy": ("https://numpy.org/doc/stable/", None),
"pyarrow": ("https://arrow.apache.org/docs/", None),
"python": ("https://docs.python.org/3/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
"rasterio": ("https://rasterio.readthedocs.io/en/stable/", None),
Expand Down
6 changes: 3 additions & 3 deletions doc/ecosystem.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ Python objects. They are based on the C++ implementation of Arrow.
```{note}
If you have [PyArrow][] installed, PyGMT does have some initial support for
`pandas.Series` and `pandas.DataFrame` objects with Apache Arrow-backed arrays.
Specifically, only uint/int/float and date32/date64 are supported for now.
Support for string Array dtypes, Duration types and GeoArrow geometry types is still a work in progress.
For more details, see
Specifically, only uint/int/float, date32/date64 and string types are supported for now.
Support for Duration types and GeoArrow geometry types is still a work in progress. For
more details, see
[issue #2800](https://github.com/GenericMappingTools/pygmt/issues/2800).
```

Expand Down
10 changes: 10 additions & 0 deletions pygmt/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,17 @@
Type aliases for type hints.
"""

import contextlib
import importlib
from collections.abc import Sequence
from typing import Literal

import numpy as np

# Anchor codes: two-letter positions combining a vertical letter
# (T/M/B = Top/Middle/Bottom) with a horizontal letter (L/C/R = Left/Center/Right).
AnchorCode = Literal["TL", "TC", "TR", "ML", "MC", "MR", "BL", "BC", "BR"]

# String array types: a sequence of strings or a numpy.ndarray.
# If pyarrow is importable, pyarrow.StringArray is added to the union as well;
# the import goes through importlib inside suppress(ImportError) so that
# pyarrow stays an optional dependency and this module imports cleanly
# without it.
StringArrayTypes = Sequence[str] | np.ndarray
with contextlib.suppress(ImportError):
    StringArrayTypes |= importlib.import_module(name="pyarrow").StringArray
27 changes: 20 additions & 7 deletions pygmt/clib/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Functions to convert data types into ctypes friendly formats.
"""

import contextlib
import ctypes as ctp
import warnings
from collections.abc import Sequence
Expand Down Expand Up @@ -156,14 +157,19 @@ def _to_numpy(data: Any) -> np.ndarray:
array
The C contiguous NumPy array.
"""
# Mapping of unsupported dtypes to the expected NumPy dtype.
dtypes: dict[str, type] = {
"date32[day][pyarrow]": np.datetime64,
"date64[ms][pyarrow]": np.datetime64,
# Mapping of unsupported dtypes to expected NumPy dtypes.
dtypes: dict[str, type | str] = {
# For string dtypes.
"large_string": np.str_, # pa.large_string and pa.large_utf8
"string": np.str_, # pa.string, pa.utf8, pd.StringDtype
"string_view": np.str_, # pa.string_view
# For datetime dtypes.
"date32[day][pyarrow]": "datetime64[D]",
"date64[ms][pyarrow]": "datetime64[ms]",
}

# The expected numpy dtype for the result numpy array, but can be None.
dtype = dtypes.get(str(getattr(data, "dtype", None)))
dtype = dtypes.get(str(getattr(data, "dtype", getattr(data, "type", None))))

# Workarounds for pandas < 2.2. Following SPEC 0, pandas 2.1 should be dropped in
# 2025 Q3, so it's likely we can remove the workaround in PyGMT v0.17.0.
Expand All @@ -184,6 +190,12 @@ def _to_numpy(data: Any) -> np.ndarray:
data = data.to_numpy(na_value=np.nan)

array = np.ascontiguousarray(data, dtype=dtype)

# Check if a np.object_ array can be converted to np.str_.
if array.dtype == np.object_:
with contextlib.suppress(TypeError, ValueError):
return np.ascontiguousarray(array, dtype=np.str_)

return array


Expand Down Expand Up @@ -284,12 +296,13 @@ def sequence_to_ctypes_array(

def strings_to_ctypes_array(strings: Sequence[str] | np.ndarray) -> ctp.Array:
"""
Convert a sequence (e.g., a list) of strings into a ctypes array.
Convert a sequence (e.g., a list) of strings or numpy.ndarray of strings into a
ctypes array.
Parameters
----------
strings
A sequence of strings.
A sequence of strings, or a numpy.ndarray of str dtype.
Returns
-------
Expand Down
6 changes: 3 additions & 3 deletions pygmt/clib/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -1475,7 +1475,7 @@ def virtualfile_from_vectors(
# 2 columns contains coordinates like longitude, latitude, or datetime string
# types.
for col, array in enumerate(arrays[2:]):
if pd.api.types.is_string_dtype(array.dtype):
if np.issubdtype(array.dtype, np.str_):
columns = col + 2
break

Expand Down Expand Up @@ -1506,9 +1506,9 @@ def virtualfile_from_vectors(
strings = string_arrays[0]
elif len(string_arrays) > 1:
strings = np.array(
[" ".join(vals) for vals in zip(*string_arrays, strict=True)]
[" ".join(vals) for vals in zip(*string_arrays, strict=True)],
dtype=np.str_,
)
strings = np.asanyarray(a=strings, dtype=np.str_)
self.put_strings(
dataset, family="GMT_IS_VECTOR|GMT_IS_DUPLICATE", strings=strings
)
Expand Down
6 changes: 3 additions & 3 deletions pygmt/src/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from collections.abc import Sequence

import numpy as np
from pygmt._typing import AnchorCode
from pygmt._typing import AnchorCode, StringArrayTypes
from pygmt.clib import Session
from pygmt.exceptions import GMTInvalidInput
from pygmt.helpers import (
Expand Down Expand Up @@ -48,7 +48,7 @@ def text_( # noqa: PLR0912
x=None,
y=None,
position: AnchorCode | None = None,
text=None,
text: str | StringArrayTypes | None = None,
angle=None,
font=None,
justify: bool | None | AnchorCode | Sequence[AnchorCode] = None,
Expand Down Expand Up @@ -104,7 +104,7 @@ def text_( # noqa: PLR0912
For example, ``position="TL"`` plots the text at the Top Left corner
of the map.
text : str or 1-D array
text
The text string, or an array of strings to plot on the figure.
angle: float, str, bool or list
Set the angle measured in degrees counter-clockwise from
Expand Down
122 changes: 120 additions & 2 deletions pygmt/tests/test_clib_to_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import sys
from datetime import date, datetime

import numpy as np
import numpy.testing as npt
Expand Down Expand Up @@ -52,11 +53,12 @@ def _check_result(result, expected_dtype):
np.complex128,
id="complex",
),
pytest.param(["abc", "defg", "12345"], np.str_, id="string"),
],
)
def test_to_numpy_python_types_numeric(data, expected_dtype):
def test_to_numpy_python_types(data, expected_dtype):
"""
Test the _to_numpy function with Python built-in numeric types.
Test the _to_numpy function with Python built-in types.
"""
result = _to_numpy(data)
_check_result(result, expected_dtype)
Expand Down Expand Up @@ -125,6 +127,17 @@ def test_to_numpy_numpy_numeric(dtype, expected_dtype):
npt.assert_array_equal(result, array, strict=True)


@pytest.mark.parametrize("dtype", [None, np.str_, "U10"])
def test_to_numpy_ndarray_numpy_dtypes_string(dtype):
    """
    Test the _to_numpy function with NumPy arrays of string types.
    """
    # Build the input array with each parametrized string dtype spelling.
    source = np.array(["abc", "defg", "12345"], dtype=dtype)
    converted = _to_numpy(source)
    # The conversion must always land on the NumPy fixed-width str dtype
    # and keep the element values intact.
    _check_result(converted, np.str_)
    npt.assert_array_equal(converted, source)


########################################################################################
# Test the _to_numpy function with pandas.Series.
#
Expand Down Expand Up @@ -249,6 +262,53 @@ def test_to_numpy_pandas_numeric_with_na(dtype, expected_dtype):
npt.assert_array_equal(result, np.array([1.0, np.nan, 5.0], dtype=expected_dtype))


@pytest.mark.parametrize(
    "dtype",
    [
        None,
        np.str_,
        "U10",
        "string[python]",
        pytest.param("string[pyarrow]", marks=skip_if_no(package="pyarrow")),
        pytest.param("string[pyarrow_numpy]", marks=skip_if_no(package="pyarrow")),
    ],
)
def test_to_numpy_pandas_series_pandas_dtypes_string(dtype):
    """
    Test the _to_numpy function with pandas.Series of pandas string types.

    In pandas, string arrays can be specified in multiple ways.

    Reference: https://pandas.pydata.org/docs/reference/api/pandas.StringDtype.html
    """
    # The same three strings, stored under each of the pandas string dtype
    # spellings (pyarrow-backed variants are skipped when pyarrow is absent).
    series = pd.Series(["abc", "defg", "12345"], dtype=dtype)
    converted = _to_numpy(series)
    # Regardless of the backing storage, the result must be a NumPy str array
    # with the original values preserved.
    _check_result(converted, np.str_)
    npt.assert_array_equal(converted, series)


@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed")
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        pytest.param("date32[day][pyarrow]", "datetime64[D]", id="date32[day]"),
        pytest.param("date64[ms][pyarrow]", "datetime64[ms]", id="date64[ms]"),
    ],
)
def test_to_numpy_pandas_series_pyarrow_dtypes_date(dtype, expected_dtype):
    """
    Test the _to_numpy function with pandas.Series of PyArrow date32/date64 types.
    """
    # Three consecutive days stored as a pyarrow-backed date Series.
    dates = pd.Series(pd.date_range(start="2024-01-01", periods=3), dtype=dtype)
    converted = _to_numpy(dates)
    _check_result(converted, np.datetime64)
    # Explicitly check the date unit ("D" or "ms"), not just the datetime64 kind.
    assert converted.dtype == expected_dtype
    expected = np.array(
        ["2024-01-01", "2024-01-02", "2024-01-03"], dtype=expected_dtype
    )
    npt.assert_array_equal(converted, expected)


########################################################################################
# Test the _to_numpy function with PyArrow arrays.
#
Expand All @@ -258,6 +318,10 @@ def test_to_numpy_pandas_numeric_with_na(dtype, expected_dtype):
# - int8, int16, int32, int64
# - uint8, uint16, uint32, uint64
# - float16, float32, float64
# - String types: string/utf8, large_string/large_utf8, string_view
# - Date types:
# - date32[day]
# - date64[ms]
#
# In PyArrow, array types can be specified in two ways:
#
Expand Down Expand Up @@ -326,3 +390,57 @@ def test_to_numpy_pyarrow_numeric_with_na(dtype, expected_dtype):
result = _to_numpy(array)
_check_result(result, expected_dtype)
npt.assert_array_equal(result, array)


@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed")
@pytest.mark.parametrize(
    "dtype",
    [
        None,
        "string",
        "utf8",  # alias for string
        "large_string",
        "large_utf8",  # alias for large_string
        "string_view",
    ],
)
def test_to_numpy_pyarrow_array_pyarrow_dtypes_string(dtype):
    """
    Test the _to_numpy function with PyArrow arrays of PyArrow string types.
    """
    # Build a pyarrow array for every string type spelling (None lets pyarrow
    # infer the type from the Python strings).
    pa_array = pa.array(["abc", "defg", "12345"], type=dtype)
    converted = _to_numpy(pa_array)
    # All pyarrow string flavors must convert to the NumPy str dtype with the
    # element values unchanged.
    _check_result(converted, np.str_)
    npt.assert_array_equal(converted, pa_array)


@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed")
@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        pytest.param("date32[day]", "datetime64[D]", id="date32[day]"),
        pytest.param("date64[ms]", "datetime64[ms]", id="date64[ms]"),
    ],
)
def test_to_numpy_pyarrow_array_pyarrow_dtypes_date(dtype, expected_dtype):
    """
    Test the _to_numpy function with PyArrow arrays of PyArrow date types.

    date32[day] and date64[ms] are stored as 32-bit and 64-bit integers, respectively,
    representing the number of days and milliseconds since the UNIX epoch (1970-01-01).

    Here we explicitly check the dtype and date unit of the result.
    """
    # Mix plain date and datetime inputs on purpose: both must be accepted by
    # the pyarrow date types.
    pa_array = pa.array(
        [date(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)],
        type=dtype,
    )
    converted = _to_numpy(pa_array)
    _check_result(converted, np.datetime64)
    # Explicitly check the date unit ("D" or "ms"), not just the datetime64 kind.
    assert converted.dtype == expected_dtype
    expected = np.array(
        ["2024-01-01", "2024-01-02", "2024-01-03"], dtype=expected_dtype
    )
    npt.assert_array_equal(converted, expected)
Loading

0 comments on commit cdf7c38

Please sign in to comment.