From ffce1b635811eb36c2a168d51309db20fc218493 Mon Sep 17 00:00:00 2001 From: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:58:26 +0100 Subject: [PATCH] feat: add `DataFrame` and `Series` `to_polars` (#1803) * feat: add DataFrame and Series to_polars * modin and cudf via pandas --- docs/api-reference/dataframe.md | 1 + docs/api-reference/series.md | 1 + narwhals/_arrow/dataframe.py | 6 +++ narwhals/_arrow/series.py | 6 +++ narwhals/_pandas_like/dataframe.py | 22 ++++++++-- narwhals/_pandas_like/series.py | 22 ++++++++-- narwhals/_polars/dataframe.py | 3 ++ narwhals/_polars/series.py | 3 ++ narwhals/dataframe.py | 67 +++++++++++++++++++++++++++++ narwhals/series.py | 61 +++++++++++++++++++++++++- tests/frame/to_polars_test.py | 23 ++++++++++ tests/series_only/to_polars_test.py | 24 +++++++++++ utils/check_api_reference.py | 1 + 13 files changed, 231 insertions(+), 9 deletions(-) create mode 100644 tests/frame/to_polars_test.py create mode 100644 tests/series_only/to_polars_test.py diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index d937bf287..9e12c389f 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -44,6 +44,7 @@ - to_native - to_numpy - to_pandas + - to_polars - unique - unpivot - with_columns diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 0aea494f7..d3bbd62f6 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -77,6 +77,7 @@ - to_list - to_numpy - to_pandas + - to_polars - to_native - unique - value_counts diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 616887951..ed738647c 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -30,6 +30,7 @@ import numpy as np import pandas as pd + import polars as pl import pyarrow as pa from typing_extensions import Self @@ -427,6 +428,11 @@ def sort( def to_pandas(self: Self) -> pd.DataFrame: return 
self._native_frame.to_pandas() + def to_polars(self: Self) -> pl.DataFrame: + import polars as pl # ignore-banned-import + + return pl.from_arrow(self._native_frame) # type: ignore[return-value] + def to_numpy(self: Self) -> np.ndarray: import numpy as np # ignore-banned-import diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 656a802ca..873b29975 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -29,6 +29,7 @@ import numpy as np import pandas as pd + import polars as pl import pyarrow as pa from typing_extensions import Self @@ -733,6 +734,11 @@ def to_pandas(self: Self) -> pd.Series: return pd.Series(self._native_series, name=self.name) + def to_polars(self: Self) -> pl.Series: + import polars as pl # ignore-banned-import + + return pl.from_arrow(self._native_series) # type: ignore[return-value] + def is_duplicated(self: Self) -> ArrowSeries: return self.to_frame().is_duplicated().alias(self.name) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 0cff4cb39..47c43a69a 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -35,6 +35,7 @@ import numpy as np import pandas as pd + import polars as pl from typing_extensions import Self from narwhals._pandas_like.group_by import PandasLikeGroupBy @@ -763,12 +764,27 @@ def to_numpy(self, dtype: Any = None, copy: bool | None = None) -> Any: ) return df.to_numpy(copy=copy) - def to_pandas(self) -> Any: + def to_pandas(self: Self) -> pd.DataFrame: if self._implementation is Implementation.PANDAS: return self._native_frame - if self._implementation is Implementation.MODIN: + elif self._implementation is Implementation.CUDF: # pragma: no cover + return self._native_frame.to_pandas() + elif self._implementation is Implementation.MODIN: return self._native_frame._to_pandas() - return self._native_frame.to_pandas() # pragma: no cover + msg = f"Unknown implementation: {self._implementation}" # pragma: no 
cover + raise AssertionError(msg) + + def to_polars(self: Self) -> pl.DataFrame: + import polars as pl # ignore-banned-import + + if self._implementation is Implementation.PANDAS: + return pl.from_pandas(self._native_frame) + elif self._implementation is Implementation.CUDF: # pragma: no cover + return pl.from_pandas(self._native_frame.to_pandas()) + elif self._implementation is Implementation.MODIN: + return pl.from_pandas(self._native_frame._to_pandas()) + msg = f"Unknown implementation: {self._implementation}" # pragma: no cover + raise AssertionError(msg) def write_parquet(self, file: Any) -> Any: self._native_frame.to_parquet(file) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 139c9a3b0..358041db7 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -29,6 +29,8 @@ if TYPE_CHECKING: from types import ModuleType + import pandas as pd + import polars as pl from typing_extensions import Self from narwhals._pandas_like.dataframe import PandasLikeDataFrame @@ -303,13 +305,13 @@ def arg_true(self) -> PandasLikeSeries: def arg_min(self) -> int: ser = self._native_series if self._implementation is Implementation.PANDAS and self._backend_version < (1,): - return ser.values.argmin() # type: ignore[no-any-return] + return ser.to_numpy().argmin() # type: ignore[no-any-return] return ser.argmin() # type: ignore[no-any-return] def arg_max(self) -> int: ser = self._native_series if self._implementation is Implementation.PANDAS and self._backend_version < (1,): - return ser.values.argmax() # type: ignore[no-any-return] + return ser.to_numpy().argmax() # type: ignore[no-any-return] return ser.argmax() # type: ignore[no-any-return] # Binary comparisons @@ -837,16 +839,28 @@ def to_numpy(self, dtype: Any = None, copy: bool | None = None) -> Any: ) return s.to_numpy(dtype=dtype, copy=copy) - def to_pandas(self) -> Any: + def to_pandas(self: Self) -> pd.Series: if self._implementation is Implementation.PANDAS: 
return self._native_series - elif self._implementation is Implementation.CUDF: + elif self._implementation is Implementation.CUDF: # pragma: no cover return self._native_series.to_pandas() elif self._implementation is Implementation.MODIN: return self._native_series._to_pandas() msg = f"Unknown implementation: {self._implementation}" # pragma: no cover raise AssertionError(msg) + def to_polars(self: Self) -> pl.Series: + import polars as pl # ignore-banned-import + + if self._implementation is Implementation.PANDAS: + return pl.from_pandas(self._native_series) + elif self._implementation is Implementation.CUDF: # pragma: no cover + return pl.from_pandas(self._native_series.to_pandas()) + elif self._implementation is Implementation.MODIN: + return pl.from_pandas(self._native_series._to_pandas()) + msg = f"Unknown implementation: {self._implementation}" # pragma: no cover + raise AssertionError(msg) + # --- descriptive --- def is_duplicated(self: Self) -> Self: res = self._native_series.duplicated(keep=False) diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index d5e115284..763a09811 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -332,6 +332,9 @@ def pivot( ) return self._from_native_object(result) + def to_polars(self: Self) -> pl.DataFrame: + return self._native_frame + class PolarsLazyFrame: def __init__( diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 33572db7c..32289632e 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -417,6 +417,9 @@ def __contains__(self: Self, other: Any) -> bool: msg = f"Unable to compare other of type {type(other)} with series of type {self.dtype}." 
raise InvalidOperationError(msg) from exc + def to_polars(self: Self) -> pl.Series: + return self._native_series + @property def dt(self: Self) -> PolarsSeriesDateTimeNamespace: return PolarsSeriesDateTimeNamespace(self) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index dd786ef3d..1ae43028c 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -30,6 +30,7 @@ import numpy as np import pandas as pd + import polars as pl import pyarrow as pa from typing_extensions import Self @@ -581,6 +582,72 @@ def to_pandas(self) -> pd.DataFrame: """ return self._compliant_frame.to_pandas() + def to_polars(self) -> pl.DataFrame: + """Convert this DataFrame to a polars DataFrame. + + Returns: + A polars DataFrame. + + Examples: + Construct pandas, Polars (eager) and PyArrow DataFrames: + + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoDataFrame + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a library agnostic function: + + >>> def agnostic_to_polars(df_native: IntoDataFrame) -> pl.DataFrame: + ... df = nw.from_native(df_native) + ... 
return df.to_polars() + + We can then pass any supported library such as pandas, Polars (eager), or + PyArrow to `agnostic_to_polars`: + + >>> agnostic_to_polars(df_pd) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + >>> agnostic_to_polars(df_pl) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + + >>> agnostic_to_polars(df_pa) + shape: (3, 3) + ┌─────┬─────┬─────┐ + │ foo ┆ bar ┆ ham │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ f64 ┆ str │ + ╞═════╪═════╪═════╡ + │ 1 ┆ 6.0 ┆ a │ + │ 2 ┆ 7.0 ┆ b │ + │ 3 ┆ 8.0 ┆ c │ + └─────┴─────┴─────┘ + """ + return self._compliant_frame.to_polars() # type: ignore[no-any-return] + @overload def write_csv(self, file: None = None) -> str: ... diff --git a/narwhals/series.py b/narwhals/series.py index 46ed53abf..dc80056c4 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -26,6 +26,7 @@ import numpy as np import pandas as pd + import polars as pl import pyarrow as pa from typing_extensions import Self @@ -2764,8 +2765,8 @@ def to_numpy(self) -> np.ndarray: """ return self._compliant_series.to_numpy() - def to_pandas(self) -> pd.Series: - """Convert to pandas. + def to_pandas(self: Self) -> pd.Series: + """Convert to pandas Series. Returns: A pandas Series containing the data from this Series. @@ -2811,6 +2812,62 @@ def to_pandas(self) -> pd.Series: """ return self._compliant_series.to_pandas() + def to_polars(self: Self) -> pl.Series: + """Convert to polars Series. + + Returns: + A polars Series containing the data from this Series. 
+ + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeries + + >>> data = [1, 2, 3] + >>> s_pd = pd.Series(data, name="a") + >>> s_pl = pl.Series("a", data) + >>> s_pa = pa.chunked_array([data]) + + We define a library agnostic function: + + >>> def agnostic_to_polars(s_native: IntoSeries) -> pl.Series: + ... s = nw.from_native(s_native, series_only=True) + ... return s.to_polars() + + We can then pass any supported library such as pandas, Polars, or + PyArrow to `agnostic_to_polars`: + + >>> agnostic_to_polars(s_pd) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: 'a' [i64] + [ + 1 + 2 + 3 + ] + + >>> agnostic_to_polars(s_pl) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: 'a' [i64] + [ + 1 + 2 + 3 + ] + + >>> agnostic_to_polars(s_pa) # doctest: +NORMALIZE_WHITESPACE + shape: (3,) + Series: '' [i64] + [ + 1 + 2 + 3 + ] + """ + return self._compliant_series.to_polars() # type: ignore[no-any-return] + def __add__(self, other: object) -> Self: return self._from_compliant_series( self._compliant_series.__add__(self._extract_native(other)) diff --git a/tests/frame/to_polars_test.py b/tests/frame/to_polars_test.py new file mode 100644 index 000000000..d8683cbd0 --- /dev/null +++ b/tests/frame/to_polars_test.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl +import pytest +from polars.testing import assert_frame_equal + +import narwhals.stable.v1 as nw + +if TYPE_CHECKING: + from tests.utils import ConstructorEager + + +@pytest.mark.filterwarnings("ignore:.*Passing a BlockManager.*:DeprecationWarning") +def test_convert_polars(constructor_eager: ConstructorEager) -> None: + data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.1, 8, 9]} + df_raw = constructor_eager(data) + result = nw.from_native(df_raw).to_polars() # type: ignore[union-attr] + + expected = pl.DataFrame(data) + + 
assert_frame_equal(result, expected) diff --git a/tests/series_only/to_polars_test.py b/tests/series_only/to_polars_test.py new file mode 100644 index 000000000..b5e66ea83 --- /dev/null +++ b/tests/series_only/to_polars_test.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl +from polars.testing import assert_series_equal + +import narwhals.stable.v1 as nw + +if TYPE_CHECKING: + from tests.utils import ConstructorEager + +data = [1, 3, 2] + + +def test_series_to_polars(constructor_eager: ConstructorEager) -> None: + result = ( + nw.from_native(constructor_eager({"a": data}), eager_only=True)["a"] + .alias("a") + .to_polars() + ) + + expected = pl.Series("a", data) + assert_series_equal(result, expected) diff --git a/utils/check_api_reference.py b/utils/check_api_reference.py index 2b85a2dc3..1b1867744 100644 --- a/utils/check_api_reference.py +++ b/utils/check_api_reference.py @@ -30,6 +30,7 @@ "to_native", "to_numpy", "to_pandas", + "to_polars", "value_counts", "zip_with", "__iter__",