diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index f8f6b9d6bb0..ccb22dbe5a2 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -45,7 +45,7 @@ jobs: # Install Micromamba with conda-forge dependencies - name: Setup Micromamba - uses: mamba-org/setup-micromamba@v2.0.0 + uses: mamba-org/setup-micromamba@v2.0.1 with: environment-name: pygmt condarc: | diff --git a/.github/workflows/cache_data.yaml b/.github/workflows/cache_data.yaml index 0c61d35b15c..e9b829dc4a2 100644 --- a/.github/workflows/cache_data.yaml +++ b/.github/workflows/cache_data.yaml @@ -43,7 +43,7 @@ jobs: # Install Micromamba with conda-forge dependencies - name: Setup Micromamba - uses: mamba-org/setup-micromamba@v2.0.0 + uses: mamba-org/setup-micromamba@v2.0.1 with: environment-name: pygmt condarc: | diff --git a/.github/workflows/ci_docs.yml b/.github/workflows/ci_docs.yml index 69987e9522e..4e85e10dae5 100644 --- a/.github/workflows/ci_docs.yml +++ b/.github/workflows/ci_docs.yml @@ -80,7 +80,7 @@ jobs: # Install Micromamba with conda-forge dependencies - name: Setup Micromamba - uses: mamba-org/setup-micromamba@v2.0.0 + uses: mamba-org/setup-micromamba@v2.0.1 with: environment-name: pygmt condarc: | diff --git a/.github/workflows/ci_doctests.yaml b/.github/workflows/ci_doctests.yaml index 0c5445dfd87..9f9f874b30b 100644 --- a/.github/workflows/ci_doctests.yaml +++ b/.github/workflows/ci_doctests.yaml @@ -42,7 +42,7 @@ jobs: # Install Micromamba with conda-forge dependencies - name: Setup Micromamba - uses: mamba-org/setup-micromamba@v2.0.0 + uses: mamba-org/setup-micromamba@v2.0.1 with: environment-name: pygmt condarc: | diff --git a/.github/workflows/ci_tests.yaml b/.github/workflows/ci_tests.yaml index fb002c0d145..68571f14135 100644 --- a/.github/workflows/ci_tests.yaml +++ b/.github/workflows/ci_tests.yaml @@ -114,7 +114,7 @@ jobs: # Install Micromamba with conda-forge dependencies - name: Setup Micromamba - uses: mamba-org/setup-micromamba@v2.0.0 + uses: mamba-org/setup-micromamba@v2.0.1 with: environment-name: pygmt condarc: | diff --git a/.github/workflows/ci_tests_dev.yaml b/.github/workflows/ci_tests_dev.yaml index c9c978e64b9..4491a2ec903 100644 --- a/.github/workflows/ci_tests_dev.yaml +++ b/.github/workflows/ci_tests_dev.yaml @@ -57,7 +57,7 @@ jobs: # Install Micromamba with conda-forge dependencies - name: Setup Micromamba - uses: mamba-org/setup-micromamba@v2.0.0 + uses: mamba-org/setup-micromamba@v2.0.1 with: environment-name: pygmt condarc: | diff --git a/.github/workflows/ci_tests_legacy.yaml b/.github/workflows/ci_tests_legacy.yaml index 4b396125afd..5ac5dcedbbc 100644 --- a/.github/workflows/ci_tests_legacy.yaml +++ b/.github/workflows/ci_tests_legacy.yaml @@ -51,7 +51,7 @@ jobs: # Install Micromamba with conda-forge dependencies - name: Setup Micromamba - uses: mamba-org/setup-micromamba@v2.0.0 + uses: mamba-org/setup-micromamba@v2.0.1 with: environment-name: pygmt condarc: | diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index b04619fccf8..b9994d873e9 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -75,10 +75,10 @@ jobs: ls -lh dist/ - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@v1.12.0 + uses: pypa/gh-action-pypi-publish@v1.12.2 with: repository-url: https://test.pypi.org/legacy/ - name: Publish to PyPI if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@v1.12.0 + uses: pypa/gh-action-pypi-publish@v1.12.2 diff --git a/doc/conf.py b/doc/conf.py index 1586601804d..613c860aa75 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -85,6 +85,7 @@ "contextily": ("https://contextily.readthedocs.io/en/stable/", None), "geopandas": ("https://geopandas.org/en/stable/", None), "numpy": ("https://numpy.org/doc/stable/", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), "python": ("https://docs.python.org/3/", None), "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), "rasterio": ("https://rasterio.readthedocs.io/en/stable/", None), diff --git a/doc/ecosystem.md b/doc/ecosystem.md index 3e265c2c5eb..0f43835d7d3 100644 --- a/doc/ecosystem.md +++ b/doc/ecosystem.md @@ -94,9 +94,9 @@ Python objects. They are based on the C++ implementation of Arrow. ```{note} If you have [PyArrow][] installed, PyGMT does have some initial support for `pandas.Series` and `pandas.DataFrame` objects with Apache Arrow-backed arrays. -Specifically, only uint/int/float and date32/date64 are supported for now. -Support for string Array dtypes, Duration types and GeoArrow geometry types is still a work in progress. -For more details, see +Specifically, only uint/int/float, date32/date64 and string types are supported for now. +Support for Duration types and GeoArrow geometry types is still a work in progress. For +more details, see [issue #2800](https://github.com/GenericMappingTools/pygmt/issues/2800). ``` diff --git a/pygmt/_typing.py b/pygmt/_typing.py index bbc7d596c65..4a57c3c7678 100644 --- a/pygmt/_typing.py +++ b/pygmt/_typing.py @@ -2,7 +2,17 @@ Type aliases for type hints. """ +import contextlib +import importlib +from collections.abc import Sequence from typing import Literal +import numpy as np + # Anchor codes AnchorCode = Literal["TL", "TC", "TR", "ML", "MC", "MR", "BL", "BC", "BR"] + +# String array types +StringArrayTypes = Sequence[str] | np.ndarray +with contextlib.suppress(ImportError): + StringArrayTypes |= importlib.import_module(name="pyarrow").StringArray diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index e7817083fb7..390b731969d 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -2,6 +2,7 @@ Functions to convert data types into ctypes friendly formats. """ +import contextlib import ctypes as ctp import warnings from collections.abc import Sequence @@ -156,14 +157,19 @@ def _to_numpy(data: Any) -> np.ndarray: array The C contiguous NumPy array. """ - # Mapping of unsupported dtypes to the expected NumPy dtype. - dtypes: dict[str, type] = { - "date32[day][pyarrow]": np.datetime64, - "date64[ms][pyarrow]": np.datetime64, + # Mapping of unsupported dtypes to expected NumPy dtypes. + dtypes: dict[str, type | str] = { + # For string dtypes. + "large_string": np.str_, # pa.large_string and pa.large_utf8 + "string": np.str_, # pa.string, pa.utf8, pd.StringDtype + "string_view": np.str_, # pa.string_view + # For datetime dtypes. + "date32[day][pyarrow]": "datetime64[D]", + "date64[ms][pyarrow]": "datetime64[ms]", } # The expected numpy dtype for the result numpy array, but can be None. - dtype = dtypes.get(str(getattr(data, "dtype", None))) + dtype = dtypes.get(str(getattr(data, "dtype", getattr(data, "type", None)))) # Workarounds for pandas < 2.2. Following SPEC 0, pandas 2.1 should be dropped in # 2025 Q3, so it's likely we can remove the workaround in PyGMT v0.17.0. @@ -184,6 +190,12 @@ def _to_numpy(data: Any) -> np.ndarray: data = data.to_numpy(na_value=np.nan) array = np.ascontiguousarray(data, dtype=dtype) + + # Check if a np.object_ array can be converted to np.str_. + if array.dtype == np.object_: + with contextlib.suppress(TypeError, ValueError): + return np.ascontiguousarray(array, dtype=np.str_) + return array @@ -284,12 +296,13 @@ def sequence_to_ctypes_array( def strings_to_ctypes_array(strings: Sequence[str] | np.ndarray) -> ctp.Array: """ - Convert a sequence (e.g., a list) of strings into a ctypes array. + Convert a sequence (e.g., a list) of strings or numpy.ndarray of strings into a + ctypes array. Parameters ---------- strings - A sequence of strings. + A sequence of strings, or a numpy.ndarray of str dtype. Returns ------- diff --git a/pygmt/clib/session.py b/pygmt/clib/session.py index 10c8770adaa..69922edc6dc 100644 --- a/pygmt/clib/session.py +++ b/pygmt/clib/session.py @@ -1475,7 +1475,7 @@ def virtualfile_from_vectors( # 2 columns contains coordinates like longitude, latitude, or datetime string # types. for col, array in enumerate(arrays[2:]): - if pd.api.types.is_string_dtype(array.dtype): + if np.issubdtype(array.dtype, np.str_): columns = col + 2 break @@ -1506,9 +1506,9 @@ def virtualfile_from_vectors( strings = string_arrays[0] elif len(string_arrays) > 1: strings = np.array( - [" ".join(vals) for vals in zip(*string_arrays, strict=True)] + [" ".join(vals) for vals in zip(*string_arrays, strict=True)], + dtype=np.str_, ) - strings = np.asanyarray(a=strings, dtype=np.str_) self.put_strings( dataset, family="GMT_IS_VECTOR|GMT_IS_DUPLICATE", strings=strings ) diff --git a/pygmt/src/text.py b/pygmt/src/text.py index 2ed475c9ac2..ad98711824b 100644 --- a/pygmt/src/text.py +++ b/pygmt/src/text.py @@ -5,7 +5,7 @@ from collections.abc import Sequence import numpy as np -from pygmt._typing import AnchorCode +from pygmt._typing import AnchorCode, StringArrayTypes from pygmt.clib import Session from pygmt.exceptions import GMTInvalidInput from pygmt.helpers import ( @@ -48,7 +48,7 @@ def text_( # noqa: PLR0912 x=None, y=None, position: AnchorCode | None = None, - text=None, + text: str | StringArrayTypes | None = None, angle=None, font=None, justify: bool | None | AnchorCode | Sequence[AnchorCode] = None, @@ -104,7 +104,7 @@ def text_( # noqa: PLR0912 For example, ``position="TL"`` plots the text at the Top Left corner of the map. - text : str or 1-D array + text The text string, or an array of strings to plot on the figure. angle: float, str, bool or list Set the angle measured in degrees counter-clockwise from diff --git a/pygmt/tests/test_clib_to_numpy.py b/pygmt/tests/test_clib_to_numpy.py index 98808f24929..16ff9b2ba70 100644 --- a/pygmt/tests/test_clib_to_numpy.py +++ b/pygmt/tests/test_clib_to_numpy.py @@ -3,6 +3,7 @@ """ import sys +from datetime import date, datetime import numpy as np import numpy.testing as npt @@ -52,11 +53,12 @@ def _check_result(result, expected_dtype): np.complex128, id="complex", ), + pytest.param(["abc", "defg", "12345"], np.str_, id="string"), ], ) -def test_to_numpy_python_types_numeric(data, expected_dtype): +def test_to_numpy_python_types(data, expected_dtype): """ - Test the _to_numpy function with Python built-in numeric types. + Test the _to_numpy function with Python built-in types. """ result = _to_numpy(data) _check_result(result, expected_dtype) @@ -125,6 +127,17 @@ def test_to_numpy_numpy_numeric(dtype, expected_dtype): npt.assert_array_equal(result, array, strict=True) +@pytest.mark.parametrize("dtype", [None, np.str_, "U10"]) +def test_to_numpy_ndarray_numpy_dtypes_string(dtype): + """ + Test the _to_numpy function with NumPy arrays of string types. + """ + array = np.array(["abc", "defg", "12345"], dtype=dtype) + result = _to_numpy(array) + _check_result(result, np.str_) + npt.assert_array_equal(result, array) + + ######################################################################################## # Test the _to_numpy function with pandas.Series. # @@ -249,6 +262,53 @@ def test_to_numpy_pandas_numeric_with_na(dtype, expected_dtype): npt.assert_array_equal(result, np.array([1.0, np.nan, 5.0], dtype=expected_dtype)) +@pytest.mark.parametrize( + "dtype", + [ + None, + np.str_, + "U10", + "string[python]", + pytest.param("string[pyarrow]", marks=skip_if_no(package="pyarrow")), + pytest.param("string[pyarrow_numpy]", marks=skip_if_no(package="pyarrow")), + ], +) +def test_to_numpy_pandas_series_pandas_dtypes_string(dtype): + """ + Test the _to_numpy function with pandas.Series of pandas string types. + + In pandas, string arrays can be specified in multiple ways. + + Reference: https://pandas.pydata.org/docs/reference/api/pandas.StringDtype.html + """ + array = pd.Series(["abc", "defg", "12345"], dtype=dtype) + result = _to_numpy(array) + _check_result(result, np.str_) + npt.assert_array_equal(result, array) + + +@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed") +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + pytest.param("date32[day][pyarrow]", "datetime64[D]", id="date32[day]"), + pytest.param("date64[ms][pyarrow]", "datetime64[ms]", id="date64[ms]"), + ], +) +def test_to_numpy_pandas_series_pyarrow_dtypes_date(dtype, expected_dtype): + """ + Test the _to_numpy function with pandas.Series of PyArrow date32/date64 types. + """ + series = pd.Series(pd.date_range(start="2024-01-01", periods=3), dtype=dtype) + result = _to_numpy(series) + _check_result(result, np.datetime64) + assert result.dtype == expected_dtype # Explicitly check the date unit. + npt.assert_array_equal( + result, + np.array(["2024-01-01", "2024-01-02", "2024-01-03"], dtype=expected_dtype), + ) + + ######################################################################################## # Test the _to_numpy function with PyArrow arrays. # @@ -258,6 +318,10 @@ def test_to_numpy_pandas_numeric_with_na(dtype, expected_dtype): # - int8, int16, int32, int64 # - uint8, uint16, uint32, uint64 # - float16, float32, float64 +# - String types: string/utf8, large_string/large_utf8, string_view +# - Date types: +# - date32[day] +# - date64[ms] # # In PyArrow, array types can be specified in two ways: # @@ -326,3 +390,57 @@ def test_to_numpy_pyarrow_numeric_with_na(dtype, expected_dtype): result = _to_numpy(array) _check_result(result, expected_dtype) npt.assert_array_equal(result, array) + + +@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed") +@pytest.mark.parametrize( + "dtype", + [ + None, + "string", + "utf8", # alias for string + "large_string", + "large_utf8", # alias for large_string + "string_view", + ], +) +def test_to_numpy_pyarrow_array_pyarrow_dtypes_string(dtype): + """ + Test the _to_numpy function with PyArrow arrays of PyArrow string types. + """ + array = pa.array(["abc", "defg", "12345"], type=dtype) + result = _to_numpy(array) + _check_result(result, np.str_) + npt.assert_array_equal(result, array) + + +@pytest.mark.skipif(not _HAS_PYARROW, reason="pyarrow is not installed") +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + pytest.param("date32[day]", "datetime64[D]", id="date32[day]"), + pytest.param("date64[ms]", "datetime64[ms]", id="date64[ms]"), + ], +) +def test_to_numpy_pyarrow_array_pyarrow_dtypes_date(dtype, expected_dtype): + """ + Test the _to_numpy function with PyArrow arrays of PyArrow date types. + + date32[day] and date64[ms] are stored as 32-bit and 64-bit integers, respectively, + representing the number of days and milliseconds since the UNIX epoch (1970-01-01). + + Here we explicitly check the dtype and date unit of the result. + """ + data = [ + date(2024, 1, 1), + datetime(2024, 1, 2), + datetime(2024, 1, 3), + ] + array = pa.array(data, type=dtype) + result = _to_numpy(array) + _check_result(result, np.datetime64) + assert result.dtype == expected_dtype # Explicitly check the date unit. + npt.assert_array_equal( + result, + np.array(["2024-01-01", "2024-01-02", "2024-01-03"], dtype=expected_dtype), + ) diff --git a/pygmt/tests/test_clib_virtualfile_from_vectors.py b/pygmt/tests/test_clib_virtualfile_from_vectors.py index 041bc7a803c..b76a9bfe168 100644 --- a/pygmt/tests/test_clib_virtualfile_from_vectors.py +++ b/pygmt/tests/test_clib_virtualfile_from_vectors.py @@ -11,6 +11,14 @@ from pygmt.clib.session import DTYPES_NUMERIC from pygmt.exceptions import GMTInvalidInput from pygmt.helpers import GMTTempFile +from pygmt.helpers.testing import skip_if_no + +try: + import pyarrow as pa + + pa_array = pa.array +except ImportError: + pa_array = None @pytest.fixture(scope="module", name="dtypes") @@ -53,17 +61,30 @@ def test_virtualfile_from_vectors(dtypes): @pytest.mark.benchmark -@pytest.mark.parametrize("dtype", [str, object]) -def test_virtualfile_from_vectors_one_string_or_object_column(dtype): - """ - Test passing in one column with string or object dtype into virtual file dataset. +@pytest.mark.parametrize( + ("array_func", "dtype"), + [ + pytest.param(np.array, {"dtype": np.str_}, id="str"), + pytest.param(np.array, {"dtype": np.object_}, id="object"), + pytest.param( + pa_array, + {}, # {"type": pa.string()} + marks=skip_if_no(package="pyarrow"), + id="pyarrow", + ), + ], +) +def test_virtualfile_from_vectors_one_string_or_object_column(array_func, dtype): + """ + Test passing in one column with string (numpy/pyarrow) or object (numpy) + dtype into virtual file dataset. """ size = 5 x = np.arange(size, dtype=np.int32) y = np.arange(size, size * 2, 1, dtype=np.int32) - strings = np.array(["a", "bc", "defg", "hijklmn", "opqrst"], dtype=dtype) + strings = array_func(["a", "bc", "defg", "hijklmn", "opqrst"], **dtype) with clib.Session() as lib: - with lib.virtualfile_from_vectors((x, y, strings)) as vfile: + with lib.virtualfile_from_vectors(vectors=(x, y, strings)) as vfile: with GMTTempFile() as outfile: lib.call_module("convert", [vfile, f"->{outfile.name}"]) output = outfile.read(keep_tabs=True) diff --git a/pygmt/tests/test_text.py b/pygmt/tests/test_text.py index 64781c514bc..593c07a7b4d 100644 --- a/pygmt/tests/test_text.py +++ b/pygmt/tests/test_text.py @@ -9,6 +9,14 @@ from pygmt import Figure from pygmt.exceptions import GMTCLibError, GMTInvalidInput from pygmt.helpers import GMTTempFile +from pygmt.helpers.testing import skip_if_no + +try: + import pyarrow as pa + + pa_array = pa.array +except ImportError: + pa_array = None TEST_DATA_DIR = Path(__file__).parent / "data" POINTS_DATA = TEST_DATA_DIR / "points.txt" @@ -48,8 +56,16 @@ def test_text_single_line_of_text(region, projection): @pytest.mark.benchmark -@pytest.mark.mpl_image_compare -def test_text_multiple_lines_of_text(region, projection): +@pytest.mark.mpl_image_compare(filename="test_text_multiple_lines_of_text.png") +@pytest.mark.parametrize( + "array_func", + [ + list, + pytest.param(np.array, id="numpy"), + pytest.param(pa_array, marks=skip_if_no(package="pyarrow"), id="pyarrow"), + ], +) +def test_text_multiple_lines_of_text(region, projection, array_func): """ Place multiple lines of text at their respective x, y locations. """ @@ -59,7 +75,7 @@ def test_text_multiple_lines_of_text(region, projection): projection=projection, x=[1.2, 1.6], y=[0.6, 0.3], - text=["This is a line of text", "This is another line of text"], + text=array_func(["This is a line of text", "This is another line of text"]), ) return fig